[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250306054532.221138-4-bharata@amd.com>
Date: Thu, 6 Mar 2025 11:15:31 +0530
From: Bharata B Rao <bharata@....com>
To: <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
CC: <AneeshKumar.KizhakeVeetil@....com>, <Hasan.Maruf@....com>,
<Jonathan.Cameron@...wei.com>, <Michael.Day@....com>,
<akpm@...ux-foundation.org>, <dave.hansen@...el.com>, <david@...hat.com>,
<feng.tang@...el.com>, <gourry@...rry.net>, <hannes@...xchg.org>,
<honggyu.kim@...com>, <hughd@...gle.com>, <jhubbard@...dia.com>,
<k.shutemov@...il.com>, <kbusch@...a.com>, <kmanaouil.dev@...il.com>,
<leesuyeon0506@...il.com>, <leillc@...gle.com>, <liam.howlett@...cle.com>,
<mgorman@...hsingularity.net>, <mingo@...hat.com>, <nadav.amit@...il.com>,
<nphamcs@...il.com>, <peterz@...radead.org>, <raghavendra.kt@....com>,
<riel@...riel.com>, <rientjes@...gle.com>, <rppt@...nel.org>,
<shivankg@....com>, <shy828301@...il.com>, <sj@...nel.org>, <vbabka@...e.cz>,
<weixugc@...gle.com>, <willy@...radead.org>, <ying.huang@...ux.alibaba.com>,
<ziy@...dia.com>, <dave@...olabs.net>, <yuanchu@...gle.com>,
<hyeonggon.yoo@...com>, Bharata B Rao <bharata@....com>
Subject: [RFC PATCH 3/4] x86: ibs: In-kernel IBS driver for memory access profiling
Use IBS (Instruction Based Sampling) feature present
in AMD processors for memory access tracking. The access
information obtained from IBS via NMI is fed to kpromoted
daemon for further action.
In addition to much other information related to the memory
access, IBS provides physical (and virtual) address of the access
and indicates if the access came from slower tier. Only memory
accesses originating from slower tiers are further acted upon
by this driver.
The samples are initially accumulated in percpu buffers which
are flushed to kpromoted using irq_work.
About IBS
---------
IBS can be programmed to provide data about instruction
execution periodically. This is done by programming a desired
sample count (number of ops) in a control register. When the
programmed number of ops are dispatched, a micro-op gets tagged,
various information about the tagged micro-op's execution is
populated in IBS execution MSRs and an interrupt is raised.
While IBS provides a lot of data for each sample, for the
purpose of memory access profiling, we are interested in
linear and physical address of the memory access that reached
DRAM. Recent AMD processors provide further filtering where
it is possible to limit the sampling to those ops that had
an L3 miss which greatly reduces the non-useful samples.
While IBS provides capability to sample instruction fetch
and execution, only IBS execution sampling is used here
to collect data about memory accesses that occur during
the instruction execution.
More information about IBS is available in Sec 13.3 of
AMD64 Architecture Programmer's Manual, Volume 2:System
Programming which is present at:
https://bugzilla.kernel.org/attachment.cgi?id=288923
Information about MSRs used for programming IBS can be
found in Sec 2.1.14.4 of PPR Vol 1 for AMD Family 19h
Model 11h B1 which is currently present at:
https://www.amd.com/system/files/TechDocs/55901_0.25.zip
Signed-off-by: Bharata B Rao <bharata@....com>
---
arch/x86/events/amd/ibs.c | 11 ++
arch/x86/include/asm/ibs.h | 7 +
arch/x86/include/asm/msr-index.h | 16 ++
arch/x86/mm/Makefile | 3 +-
arch/x86/mm/ibs.c | 312 +++++++++++++++++++++++++++++++
include/linux/vm_event_item.h | 17 ++
mm/vmstat.c | 17 ++
7 files changed, 382 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/include/asm/ibs.h
create mode 100644 arch/x86/mm/ibs.c
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index e7a8b8758e08..35497e8c0846 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -13,8 +13,10 @@
#include <linux/ptrace.h>
#include <linux/syscore_ops.h>
#include <linux/sched/clock.h>
+#include <linux/kpromoted.h>
#include <asm/apic.h>
+#include <asm/ibs.h>
#include "../perf_event.h"
@@ -1539,6 +1541,15 @@ static __init int amd_ibs_init(void)
{
u32 caps;
+ /*
+ * TODO: Find a clean way to disable perf IBS so that IBS
+ * can be used for memory access profiling.
+ */
+ if (arch_hw_access_profiling) {
+ pr_info("IBS isn't available for perf use\n");
+ return 0;
+ }
+
caps = __get_ibs_caps();
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
diff --git a/arch/x86/include/asm/ibs.h b/arch/x86/include/asm/ibs.h
new file mode 100644
index 000000000000..b5a4f2ca6330
--- /dev/null
+++ b/arch/x86/include/asm/ibs.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IBS_H
+#define _ASM_X86_IBS_H
+
+extern bool arch_hw_access_profiling;
+
+#endif /* _ASM_X86_IBS_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 72765b2fe0d8..12291e362b01 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -719,6 +719,22 @@
/* AMD Last Branch Record MSRs */
#define MSR_AMD64_LBR_SELECT 0xc000010e
+/*
+ * AMD IBS MSR bits
+ *
+ * MSR_AMD64_IBSOPDATA2_DATASRC and MSR_AMD64_IBSOPDATA2_RMTNODE are masks
+ * applied to the IBSOPDATA2 value; the MSR_AMD64_IBSOPDATA2_DATASRC_*
+ * names are values the extracted data source is compared against.
+ * MSR_AMD64_IBSOPDATA2_DATASRC_EXT_MEM (0x8) lies outside the 0x7 mask,
+ * so it can only match after the user of these definitions has merged
+ * further bits into the data source.
+ */
+#define MSR_AMD64_IBSOPDATA2_DATASRC 0x7
+#define MSR_AMD64_IBSOPDATA2_DATASRC_LCL_CACHE 0x1
+#define MSR_AMD64_IBSOPDATA2_DATASRC_PEER_CACHE_NEAR 0x2
+#define MSR_AMD64_IBSOPDATA2_DATASRC_DRAM 0x3
+#define MSR_AMD64_IBSOPDATA2_DATASRC_FAR_CCX_CACHE 0x5
+#define MSR_AMD64_IBSOPDATA2_DATASRC_EXT_MEM 0x8
+#define MSR_AMD64_IBSOPDATA2_RMTNODE 0x10
+
+#define MSR_AMD64_IBSOPDATA3_LDOP BIT_ULL(0)
+#define MSR_AMD64_IBSOPDATA3_STOP BIT_ULL(1)
+#define MSR_AMD64_IBSOPDATA3_DCMISS BIT_ULL(7)
+#define MSR_AMD64_IBSOPDATA3_LADDR_VALID BIT_ULL(17)
+#define MSR_AMD64_IBSOPDATA3_PADDR_VALID BIT_ULL(18)
+#define MSR_AMD64_IBSOPDATA3_L2MISS BIT_ULL(20)
+
/* Zen4 */
#define MSR_ZEN4_BP_CFG 0xc001102e
#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 690fbf48e853..3b1a5dbbac64 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -26,7 +26,8 @@ CFLAGS_REMOVE_pgprot.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
- pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o pgprot.o
+ pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o pgprot.o \
+ ibs.o
obj-y += pat/
diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
new file mode 100644
index 000000000000..5c966050ad86
--- /dev/null
+++ b/arch/x86/mm/ibs.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/kpromoted.h>
+#include <linux/percpu.h>
+#include <linux/workqueue.h>
+#include <linux/irq_work.h>
+
+#include <asm/nmi.h>
+#include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
+#include <asm/apic.h>
+#include <asm/ibs.h>
+
+bool arch_hw_access_profiling;
+static u64 ibs_config __read_mostly;
+static u32 ibs_caps;
+
+#define IBS_NR_SAMPLES 50
+
+/*
+ * Basic access info captured for each memory access.
+ *
+ * One entry of the percpu sample buffer: filled in by ibs_push_sample()
+ * and passed field by field to kpromoted_record_access() by
+ * ibs_work_handler().
+ */
+struct ibs_sample {
+ unsigned long pfn; /* page frame number, PHYS_PFN() of the IBS physical address */
+ unsigned long time; /* jiffies when accessed */
+ int nid; /* Accessing node ID, if known */
+};
+
+/*
+ * Percpu buffer of access samples. Samples are accumulated here
+ * before pushing them to kpromoted for further action.
+ *
+ * Used as a ring: ibs_push_sample() stores at head and ibs_pop_sample()
+ * reads from tail, both wrapping at IBS_NR_SAMPLES. head == tail means
+ * empty; a push is refused when advancing head would make it equal to
+ * tail, so at most IBS_NR_SAMPLES - 1 samples are held at a time.
+ */
+struct ibs_sample_pcpu {
+ struct ibs_sample samples[IBS_NR_SAMPLES];
+ int head, tail; /* next slot to write, next slot to read */
+};
+
+struct ibs_sample_pcpu __percpu *ibs_s;
+
+/*
+ * The workqueue for pushing the percpu access samples to kpromoted.
+ */
+static struct work_struct ibs_work;
+static struct irq_work ibs_irq_work;
+
+/*
+ * Record the IBS-reported access sample in this CPU's buffer.
+ * Called from the IBS NMI handler.
+ *
+ * @pfn:  page frame number of the access
+ * @nid:  node ID of the accessing CPU
+ * @time: jiffies value at the time of the access
+ *
+ * Returns 1 if the sample was stored, 0 if the buffer is full and the
+ * sample was dropped.
+ */
+static int ibs_push_sample(unsigned long pfn, int nid, unsigned long time)
+{
+	struct ibs_sample_pcpu *ibs_pcpu = raw_cpu_ptr(ibs_s);
+	struct ibs_sample *slot = &ibs_pcpu->samples[ibs_pcpu->head];
+	int next = ibs_pcpu->head + 1;
+
+	if (next >= IBS_NR_SAMPLES)
+		next = 0;
+
+	/* One slot always stays unused so that head == tail means empty. */
+	if (next == ibs_pcpu->tail)
+		return 0;
+
+	slot->pfn = pfn;
+	/* nid must be stored too: ibs_work_handler() forwards it to kpromoted. */
+	slot->nid = nid;
+	slot->time = time;
+	ibs_pcpu->head = next;
+	return 1;
+}
+
+/*
+ * Copy the oldest sample in this CPU's buffer into *s and consume it.
+ * Returns 1 if a sample was copied, 0 if the buffer was empty.
+ */
+static int ibs_pop_sample(struct ibs_sample *s)
+{
+	struct ibs_sample_pcpu *buf = raw_cpu_ptr(ibs_s);
+	int cur = buf->tail;
+
+	/* head and tail coincide when nothing is queued. */
+	if (cur == buf->head)
+		return 0;
+
+	*s = buf->samples[cur];
+	/* Step tail to the next slot, wrapping at the end of the array. */
+	buf->tail = (cur + 1 >= IBS_NR_SAMPLES) ? 0 : cur + 1;
+	return 1;
+}
+
+/*
+ * Work item body: drain the sample buffer of the CPU it runs on,
+ * reporting every entry to kpromoted tagged as KPROMOTED_HW_HINTS.
+ */
+static void ibs_work_handler(struct work_struct *work)
+{
+	struct ibs_sample sample;
+
+	while (ibs_pop_sample(&sample)) {
+		kpromoted_record_access(sample.pfn, sample.nid,
+					KPROMOTED_HW_HINTS, sample.time);
+	}
+}
+
+/*
+ * irq_work callback, queued from the NMI handler once a sample has been
+ * stored: schedules ibs_work on the CPU this callback is running on.
+ *
+ * NOTE(review): ibs_work is one work item shared by every CPU; presumably
+ * scheduling it has no effect while it is still pending elsewhere -
+ * confirm samples buffered on this CPU cannot be left undrained.
+ */
+static void ibs_irq_handler(struct irq_work *i)
+{
+	int this_cpu = smp_processor_id();
+
+	schedule_work_on(this_cpu, &ibs_work);
+}
+
+/*
+ * IBS NMI handler: Process the memory access info reported by IBS.
+ *
+ * Reads the MSRs to collect all the information about the reported
+ * memory access, validates the access, stores the valid sample and
+ * schedules the work on this CPU to further process the sample.
+ *
+ * A sample is kept only if it was taken in user mode by a task with an
+ * mm, is a load or store with a DC-miss or L2-miss indication, carries
+ * valid linear and physical addresses, the linear address has bit 63
+ * clear, and the pfn maps to an online page that is on the LRU. Every
+ * other outcome bumps the matching HWHINT_* vm event (except the
+ * offline-page case, which has no counter) and drops the sample.
+ *
+ * The data source and remote-node bits only feed statistics counters
+ * here; they do not decide whether the sample is kept.
+ *
+ * Always returns NMI_HANDLED, including when IBS_OP_VAL is clear.
+ */
+static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ struct mm_struct *mm = current->mm;
+ u64 ops_ctl, ops_data3, ops_data2;
+ u64 laddr = -1, paddr = -1;
+ u64 data_src, rmt_node;
+ struct page *page;
+ unsigned long pfn;
+
+ rdmsrl(MSR_AMD64_IBSOPCTL, ops_ctl);
+
+ /*
+ * When IBS sampling period is reprogrammed via read-modify-update
+ * of MSR_AMD64_IBSOPCTL, overflow NMIs could be generated with
+ * IBS_OP_ENABLE not set. For such cases, return as HANDLED.
+ *
+ * With this, the handler will say "handled" for all NMIs that
+ * aren't related to this NMI. This stems from the limitation of
+ * having both status and control bits in one MSR.
+ *
+ * NOTE(review): the text above talks about IBS_OP_ENABLE but the
+ * test below is on IBS_OP_VAL - confirm which bit is intended.
+ */
+ if (!(ops_ctl & IBS_OP_VAL))
+ goto handled;
+
+ wrmsrl(MSR_AMD64_IBSOPCTL, ops_ctl & ~IBS_OP_VAL); /* write the value back with IBS_OP_VAL cleared */
+
+ count_vm_event(HWHINT_NR_EVENTS); /* counted for every sample that had IBS_OP_VAL set */
+
+ if (!user_mode(regs)) {
+ count_vm_event(HWHINT_KERNEL);
+ goto handled;
+ }
+
+ if (!mm) {
+ count_vm_event(HWHINT_KTHREAD);
+ goto handled;
+ }
+
+ rdmsrl(MSR_AMD64_IBSOPDATA3, ops_data3);
+
+ /* Load/Store ops only */
+ /* TODO: DataSrc isn't valid for stores, so filter out stores? */
+ if (!(ops_data3 & (MSR_AMD64_IBSOPDATA3_LDOP |
+ MSR_AMD64_IBSOPDATA3_STOP))) {
+ count_vm_event(HWHINT_NON_LOAD_STORES);
+ goto handled;
+ }
+
+ /*
+ * Discard the sample if it was L1 or L2 hit.
+ * NOTE(review): this drops the sample only when neither DCMISS nor
+ * L2MISS is set - confirm that is the intended "L1 or L2 hit" test.
+ */
+ if (!(ops_data3 & (MSR_AMD64_IBSOPDATA3_DCMISS |
+ MSR_AMD64_IBSOPDATA3_L2MISS))) {
+ count_vm_event(HWHINT_DC_L2_HITS);
+ goto handled;
+ }
+
+ rdmsrl(MSR_AMD64_IBSOPDATA2, ops_data2);
+ data_src = ops_data2 & MSR_AMD64_IBSOPDATA2_DATASRC;
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ data_src |= ((ops_data2 & 0xC0) >> 3); /* bits 7:6 of IBSOPDATA2 become bits 4:3 of data_src */
+
+ switch (data_src) { /* statistics only; values without a case are not counted */
+ case MSR_AMD64_IBSOPDATA2_DATASRC_LCL_CACHE:
+ count_vm_event(HWHINT_LOCAL_L3L1L2);
+ break;
+ case MSR_AMD64_IBSOPDATA2_DATASRC_PEER_CACHE_NEAR:
+ count_vm_event(HWHINT_LOCAL_PEER_CACHE_NEAR);
+ break;
+ case MSR_AMD64_IBSOPDATA2_DATASRC_DRAM:
+ count_vm_event(HWHINT_DRAM_ACCESSES);
+ break;
+ case MSR_AMD64_IBSOPDATA2_DATASRC_EXT_MEM:
+ count_vm_event(HWHINT_CXL_ACCESSES);
+ break;
+ case MSR_AMD64_IBSOPDATA2_DATASRC_FAR_CCX_CACHE:
+ count_vm_event(HWHINT_FAR_CACHE_HITS);
+ break;
+ }
+
+ rmt_node = ops_data2 & MSR_AMD64_IBSOPDATA2_RMTNODE;
+ if (rmt_node)
+ count_vm_event(HWHINT_REMOTE_NODE);
+
+ /* Is linear addr valid? */
+ if (ops_data3 & MSR_AMD64_IBSOPDATA3_LADDR_VALID)
+ rdmsrl(MSR_AMD64_IBSDCLINAD, laddr);
+ else {
+ count_vm_event(HWHINT_LADDR_INVALID);
+ goto handled;
+ }
+
+ /* Discard kernel address accesses */
+ if (laddr & (1UL << 63)) { /* bit 63 set is treated as a kernel address */
+ count_vm_event(HWHINT_KERNEL_ADDR);
+ goto handled;
+ }
+
+ /* Is phys addr valid? */
+ if (ops_data3 & MSR_AMD64_IBSOPDATA3_PADDR_VALID)
+ rdmsrl(MSR_AMD64_IBSDCPHYSAD, paddr);
+ else {
+ count_vm_event(HWHINT_PADDR_INVALID);
+ goto handled;
+ }
+
+ pfn = PHYS_PFN(paddr);
+ page = pfn_to_online_page(pfn);
+ if (!page) /* no online page for this pfn: dropped without a counter */
+ goto handled;
+
+ if (!PageLRU(page)) {
+ count_vm_event(HWHINT_NON_LRU);
+ goto handled;
+ }
+
+ if (!ibs_push_sample(pfn, numa_node_id(), jiffies)) { /* returns 0 when this CPU's buffer is full */
+ count_vm_event(HWHINT_BUFFER_FULL);
+ goto handled;
+ }
+
+ irq_work_queue(&ibs_irq_work); /* ibs_irq_handler() then schedules ibs_work on this CPU */
+ count_vm_event(HWHINT_USEFUL_SAMPLES);
+
+handled:
+ return NMI_HANDLED;
+}
+
+/*
+ * Read the IBS LVT offset from MSR_AMD64_IBSCTL.
+ * Returns the offset field, or -EINVAL when the MSR does not flag the
+ * offset as valid.
+ */
+static inline int get_ibs_lvt_offset(void)
+{
+	u64 ibsctl;
+
+	rdmsrl(MSR_AMD64_IBSCTL, ibsctl);
+	if (ibsctl & IBSCTL_LVT_OFFSET_VALID)
+		return ibsctl & IBSCTL_LVT_OFFSET_MASK;
+
+	return -EINVAL;
+}
+
+/*
+ * Program the APIC extended LVT entry used by IBS on the calling CPU
+ * with the NMI message type. Logs a warning if the LVT offset is not
+ * valid or setup_APIC_eilvt() reports failure.
+ */
+static void setup_APIC_ibs(void)
+{
+	int offset = get_ibs_lvt_offset();
+
+	/* Success needs a valid offset and a zero return from the APIC setup. */
+	if (offset >= 0 &&
+	    !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+		return;
+
+	pr_warn("IBS APIC setup failed on cpu #%d\n", smp_processor_id());
+}
+
+/*
+ * Counterpart of setup_APIC_ibs(): reprogram the IBS extended LVT entry
+ * with APIC_EILVT_MSG_FIX and a final argument of 1. Does nothing when
+ * the LVT offset is not valid.
+ */
+static void clear_APIC_ibs(void)
+{
+	int offset = get_ibs_lvt_offset();
+
+	if (offset < 0)
+		return;
+
+	setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+static int x86_amd_ibs_access_profile_startup(unsigned int cpu)
+{ /* startup callback passed to cpuhp_setup_state() by ibs_access_profiling_init() */
+ setup_APIC_ibs(); /* cpu is unused: setup_APIC_ibs() takes no argument */
+ return 0; /* always 0: setup_APIC_ibs() returns void and only warns on failure */
+}
+
+static int x86_amd_ibs_access_profile_teardown(unsigned int cpu)
+{ /* teardown callback passed to cpuhp_setup_state() by ibs_access_profiling_init() */
+ clear_APIC_ibs(); /* cpu is unused: clear_APIC_ibs() takes no argument */
+ return 0; /* always 0: clear_APIC_ibs() returns void */
+}
+
+/*
+ * Set up IBS-based memory access profiling at boot.
+ *
+ * Allocates the percpu sample buffers, prepares the work and irq_work
+ * items, computes ibs_config/ibs_caps, registers the IBS NMI handler
+ * and installs the CPU hotplug callbacks that program the IBS APIC
+ * LVT entry.
+ *
+ * Returns 0 on success or when the CPU has no IBS support, -ENOMEM if
+ * the sample buffers cannot be allocated, or the error reported by NMI
+ * handler or hotplug state registration. On error everything set up so
+ * far is undone.
+ */
+static int __init ibs_access_profiling_init(void)
+{
+	int ret;
+
+	if (!boot_cpu_has(X86_FEATURE_IBS)) {
+		pr_info("IBS capability is unavailable for access profiling\n");
+		return 0;
+	}
+
+	/*
+	 * GFP_KERNEL is required: with __GFP_ZERO alone the percpu
+	 * allocator treats the request as an atomic allocation.
+	 */
+	ibs_s = alloc_percpu_gfp(struct ibs_sample_pcpu,
+				 GFP_KERNEL | __GFP_ZERO);
+	if (!ibs_s)
+		return -ENOMEM;
+
+	INIT_WORK(&ibs_work, ibs_work_handler);
+	init_irq_work(&ibs_irq_work, ibs_irq_handler);
+
+	/* Uses IBS Op sampling */
+	ibs_config = IBS_OP_CNT_CTL | IBS_OP_ENABLE;
+	ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
+	if (ibs_caps & IBS_CAPS_ZEN4)
+		ibs_config |= IBS_OP_L3MISSONLY;
+
+	ret = register_nmi_handler(NMI_LOCAL, ibs_overflow_handler, 0, "ibs");
+	if (ret)
+		goto err_free;
+
+	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
+				"x86/amd/ibs_access_profile:starting",
+				x86_amd_ibs_access_profile_startup,
+				x86_amd_ibs_access_profile_teardown);
+	if (ret < 0)
+		goto err_nmi;
+
+	pr_info("IBS setup for memory access profiling\n");
+	return 0;
+
+err_nmi:
+	unregister_nmi_handler(NMI_LOCAL, "ibs");
+err_free:
+	free_percpu(ibs_s);
+	ibs_s = NULL;
+	return ret;
+}
+
+arch_initcall(ibs_access_profiling_init);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index b5823b037883..24279c46054c 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -195,6 +195,23 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
KPROMOTED_MIG_CANDIDATE,
KPROMOTED_MIG_PROMOTED,
KPROMOTED_MIG_DROPPED,
+ HWHINT_NR_EVENTS,
+ HWHINT_KERNEL,
+ HWHINT_KTHREAD,
+ HWHINT_NON_LOAD_STORES,
+ HWHINT_DC_L2_HITS,
+ HWHINT_LOCAL_L3L1L2,
+ HWHINT_LOCAL_PEER_CACHE_NEAR,
+ HWHINT_FAR_CACHE_HITS,
+ HWHINT_DRAM_ACCESSES,
+ HWHINT_CXL_ACCESSES,
+ HWHINT_REMOTE_NODE,
+ HWHINT_LADDR_INVALID,
+ HWHINT_KERNEL_ADDR,
+ HWHINT_PADDR_INVALID,
+ HWHINT_NON_LRU,
+ HWHINT_BUFFER_FULL,
+ HWHINT_USEFUL_SAMPLES,
NR_VM_EVENT_ITEMS
};
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 618f44bae5c8..a21d3118d6f6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1479,6 +1479,23 @@ const char * const vmstat_text[] = {
"kpromoted_mig_candidate",
"kpromoted_mig_promoted",
"kpromoted_mig_dropped",
+ "hwhint_nr_events",
+ "hwhint_kernel",
+ "hwhint_kthread",
+ "hwhint_non_load_stores",
+ "hwhint_dc_l2_hits",
+ "hwhint_local_l3l1l2",
+ "hwhint_local_peer_cache_near",
+ "hwhint_far_cache_hits",
+ "hwhint_dram_accesses",
+ "hwhint_cxl_accesses",
+ "hwhint_remote_node",
+ "hwhint_invalid_laddr",
+ "hwhint_kernel_addr",
+ "hwhint_invalid_paddr",
+ "hwhint_non_lru",
+ "hwhint_buffer_full",
+ "hwhint_useful_samples",
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
--
2.34.1
Powered by blists - more mailing lists