Message-Id: <2bf90acf7d29641ba6643934ff8dbba897dbd2d9.1718873074.git.jialong.yang@shingroup.cn>
Date: Thu, 20 Jun 2024 16:51:17 +0800
From: Jialong Yang <jialong.yang@...ngroup.cn>
To: Michael Ellerman <mpe@...erman.id.au>,
	Nicholas Piggin <npiggin@...il.com>,
	Christophe Leroy <christophe.leroy@...roup.eu>,
	"Naveen N. Rao" <naveen.n.rao@...ux.ibm.com>
Cc: luming.yu@...ngroup.cn,
	shenghui.qu@...ngroup.cn,
	Jialong Yang <jialong.yang@...ngroup.cn>,
	linuxppc-dev@...ts.ozlabs.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH v1 1/2] powerpc/mmiotrace: Add MMIO Tracing tool for PowerPC

mmiotrace is a useful tool for tracing MMIO accesses. So far it is only
supported on the x86 and x86_64 platforms; this adds support for powerpc.
The user-facing API is unchanged, so the existing manual at
Documentation/trace/mmiotrace.rst still applies and the tool is easy to
pick up.
Almost all files are copied from arch/x86/mm; the only differences stem
from hardware and architecture-specific details.

Link: https://lore.kernel.org/lkml/20080127195536.50809974@daedalus.pq.iki.fi/

Signed-off-by: Jialong Yang <jialong.yang@...ngroup.cn>
---
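
For reference, usage is unchanged from the existing documentation. A rough
sketch of a typical session (assuming debugfs is mounted at
/sys/kernel/debug, as described in Documentation/trace/mmiotrace.rst):

  echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
  cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
  ... load or exercise the driver whose MMIO accesses are to be traced ...
  echo nop > /sys/kernel/debug/tracing/current_tracer
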
 arch/powerpc/Kconfig.debug       |   3 +
 arch/powerpc/mm/Makefile         |   1 +
 arch/powerpc/mm/kmmio.c          | 649 +++++++++++++++++++++++++++++++
 arch/powerpc/mm/mmio-mod.c       | 414 ++++++++++++++++++++
 arch/powerpc/mm/mmiotrace_arch.c | 149 +++++++
 arch/powerpc/mm/mmiotrace_arch.h |  25 ++
 arch/powerpc/mm/pf_in.c          | 185 +++++++++
 arch/powerpc/mm/pf_in.h          |  33 ++
 8 files changed, 1459 insertions(+)
 create mode 100644 arch/powerpc/mm/kmmio.c
 create mode 100644 arch/powerpc/mm/mmio-mod.c
 create mode 100644 arch/powerpc/mm/mmiotrace_arch.c
 create mode 100644 arch/powerpc/mm/mmiotrace_arch.h
 create mode 100644 arch/powerpc/mm/pf_in.c
 create mode 100644 arch/powerpc/mm/pf_in.h

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 8c80b154e814..8a69188aa75a 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
+config HAVE_MMIOTRACE_SUPPORT
+	def_bool y
+
 config PPC_DISABLE_WERROR
 	bool "Don't build arch/powerpc code with -Werror"
 	help
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0fe2f085c05a..cb92049f1239 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
 obj-$(CONFIG_PTDUMP_CORE)	+= ptdump/
 obj-$(CONFIG_KASAN)		+= kasan/
+obj-$(CONFIG_MMIOTRACE) += kmmio.o mmio-mod.o pf_in.o mmiotrace_arch.o
diff --git a/arch/powerpc/mm/kmmio.c b/arch/powerpc/mm/kmmio.c
new file mode 100644
index 000000000000..f4374e721b37
--- /dev/null
+++ b/arch/powerpc/mm/kmmio.c
@@ -0,0 +1,649 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Support for MMIO probes.
+ * Derived from arch/x86/mm/kmmio.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@...ngroup.cn)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/paca.h>
+#include <linux/errno.h>
+#include <linux/mmiotrace.h>
+
+#include "mmiotrace_arch.h"
+
+typedef unsigned long	pteval_t;
+typedef unsigned long	pmdval_t;
+
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+struct kmmio_fault_page {
+	struct list_head list;
+	struct kmmio_fault_page *release_next;
+	unsigned long addr; /* the requested address */
+	pteval_t old_presence; /* page presence prior to arming */
+	bool armed;
+
+	/*
+	 * Number of times this page has been registered as a part
+	 * of a probe. If zero, page is disarmed and this may be freed.
+	 * Used only by writers (RCU) and post_kmmio_handler().
+	 * Protected by kmmio_lock, when linked into kmmio_page_table.
+	 */
+	int count;
+
+	bool scheduled_for_release;
+};
+
+struct kmmio_delayed_release {
+	struct rcu_head rcu;
+	struct kmmio_fault_page *release_list;
+};
+
+struct kmmio_context {
+	struct kmmio_fault_page *fpage;
+	struct kmmio_probe *probe;
+	unsigned long saved_flags;
+	unsigned long saved_softe;
+	unsigned long addr;
+	int active;
+};
+
+/*
+ * The kmmio_lock is taken in the single-step exception path, which is
+ * treated as NMI-like context. This causes lockdep to complain about it
+ * being in both NMI and normal
+ * context. Hide it from lockdep, as it should not have any other locks
+ * taken under it, and this is only enabled for debugging mmio anyway.
+ */
+static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+/* Protected by kmmio_lock */
+unsigned int kmmio_count;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+static struct list_head *kmmio_page_list(unsigned long addr)
+{
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+
+	if (!pte)
+		return NULL;
+	addr &= page_level_mask(l);
+
+	return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
+}
+
+/* Accessed per-cpu */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+	struct kmmio_probe *p;
+
+	list_for_each_entry_rcu(p, &kmmio_probes, list) {
+		if (addr >= p->addr && addr < (p->addr + p->len))
+			return p;
+	}
+	return NULL;
+}
+
+/* You must be holding RCU read lock. */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
+{
+	struct list_head *head;
+	struct kmmio_fault_page *f;
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+
+	if (!pte)
+		return NULL;
+	addr &= page_level_mask(l);
+	head = kmmio_page_list(addr);
+	list_for_each_entry_rcu(f, head, list) {
+		if (f->addr == addr)
+			return f;
+	}
+	return NULL;
+}
+
+static inline pmd_t pmd_mkinvalid(pmd_t pmd)
+{
+	return __pmd_raw(pmd_raw(pmd) & ~cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
+}
+
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
+{
+	pmd_t new_pmd;
+	pmdval_t v = pmd_val(*pmd);
+
+	if (clear) {
+		*old = v;
+		new_pmd = pmd_mkinvalid(*pmd);
+	} else {
+		/* Presume this has been called with clear==true previously */
+		new_pmd = __pmd(*old);
+	}
+	*pmd = new_pmd;
+}
+
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old, unsigned long addr)
+{
+	pteval_t v = pte_val(*pte);
+
+	if (clear) {
+		*old = v;
+		/* Nothing should care about address */
+		pte_clear(&init_mm, addr, pte);
+	} else {
+		/* Presume this has been called with clear==true previously */
+		set_pte_at(&init_mm, addr, pte, __pte(*old));
+	}
+}
+
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
+{
+	unsigned int level;
+	pte_t *pte = lookup_address(f->addr, &level);
+
+	if (!pte) {
+		pr_err("no pte for addr 0x%08lx\n", f->addr);
+		return -1;
+	}
+
+	if (level == PMD_SHIFT)
+		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
+	else if (level == PAGE_SHIFT)
+		clear_pte_presence(pte, clear, &f->old_presence, f->addr);
+	else {
+		pr_err("unexpected page level 0x%x.\n", level);
+		return -1;
+	}
+
+	mmap_read_lock(&init_mm);
+	struct vm_area_struct *vma = find_vma(&init_mm, f->addr);
+
+	mmap_read_unlock(&init_mm);
+
+	flush_tlb_page(vma, f->addr);
+
+	return 0;
+}
+
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+	int ret;
+
+	WARN_ONCE(f->armed, pr_fmt("kmmio page already armed.\n"));
+	if (f->armed) {
+		pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
+			f->addr, f->count, !!f->old_presence);
+	}
+	ret = clear_page_presence(f, true);
+	WARN_ONCE(ret < 0, pr_fmt("arming at 0x%08lx failed.\n"),
+		  f->addr);
+	f->armed = true;
+	return ret;
+}
+
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+	int ret = clear_page_presence(f, false);
+
+	WARN_ONCE(ret < 0,
+			KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
+	f->armed = false;
+}
+
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefetching may
+ * trigger a page fault. We may be in the middle of a process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
+ * Interrupts are disabled on entry to the page-fault path and remain
+ * disabled throughout this function.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+	struct kmmio_context *ctx;
+	struct kmmio_fault_page *faultpage;
+	int ret = 0; /* default to fault not handled */
+	unsigned long page_base = addr;
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+
+	if (!pte)
+		return -EINVAL;
+	page_base &= page_level_mask(l);
+
+	/*
+	 * Hold the RCU read lock over single stepping to avoid looking
+	 * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
+	 * also disables preemption and prevents process switch during
+	 * the single stepping. We can only handle one active kmmio trace
+	 * per cpu, so ensure that we finish it before something else
+	 * gets to run.
+	 */
+	rcu_read_lock_sched_notrace();
+
+	faultpage = get_kmmio_fault_page(page_base);
+	if (!faultpage) {
+		/*
+		 * Either this page fault is not caused by kmmio, or
+		 * another CPU just pulled the kmmio probe from under
+		 * our feet. The latter case should not be possible.
+		 */
+		goto no_kmmio;
+	}
+
+	ctx = this_cpu_ptr(&kmmio_ctx);
+	if (ctx->active) {
+		if (page_base == ctx->addr) {
+			/*
+			 * A second fault on the same page means some other
+			 * condition needs handling by do_page_fault(), the
+			 * page really not being present is the most common.
+			 */
+			pr_debug("secondary hit for 0x%08lx CPU %d.\n",
+				 addr, smp_processor_id());
+
+			if (!faultpage->old_presence)
+				pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
+					addr, smp_processor_id());
+		} else {
+			/*
+			 * Prevent overwriting already in-flight context.
+			 * This should not happen, let's hope disarming at
+			 * least prevents a panic.
+			 */
+			pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
+				 smp_processor_id(), addr);
+			pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
+			disarm_kmmio_fault_page(faultpage);
+		}
+		goto no_kmmio;
+	}
+	ctx->active++;
+
+	ctx->fpage = faultpage;
+	ctx->probe = get_kmmio_probe(page_base);
+	ctx->saved_flags = (regs->msr & (MSR_SE | MSR_EE));
+	ctx->saved_softe = regs->softe;
+	ctx->addr = page_base;
+
+	if (ctx->probe && ctx->probe->pre_handler)
+		ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+	/*
+	 * Enable single-stepping and disable interrupts for the faulting
+	 * context. Local interrupts must not get enabled during stepping.
+	 */
+	regs->msr |= MSR_SE;         // single step
+	regs->msr &= ~MSR_EE;        // hard interrupt
+	regs->softe = IRQS_DISABLED; // soft interrupt
+
+	local_paca->srr_valid = 0;
+
+	/* Now we set present bit in PTE and single step. */
+	disarm_kmmio_fault_page(ctx->fpage);
+
+	/*
+	 * If another cpu accesses the same page while we are stepping,
+	 * the access will not be caught. It will simply succeed and the
+	 * only downside is we lose the event. If this becomes a problem,
+	 * the user should drop to single cpu before tracing.
+	 */
+
+	return 1; /* fault handled */
+
+no_kmmio:
+	rcu_read_unlock_sched_notrace();
+	return ret;
+}
+
+/*
+ * Interrupts are disabled on entry to the single-step exception handler
+ * and remain disabled throughout this function.
+ * This must always get called as the pair to kmmio_handler().
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+	int ret = 0;
+	struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx);
+
+	if (!ctx->active) {
+		/*
+		 * debug traps without an active context are due to either
+		 * something external causing them (f.e. using a debugger while
+		 * mmio tracing enabled), or erroneous behaviour
+		 */
+		pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id());
+		goto out;
+	}
+
+	if (ctx->probe && ctx->probe->post_handler)
+		ctx->probe->post_handler(ctx->probe, condition, regs);
+
+	/* Prevent racing against release_kmmio_fault_page(). */
+	arch_spin_lock(&kmmio_lock);
+	if (ctx->fpage->count)
+		arm_kmmio_fault_page(ctx->fpage);
+	arch_spin_unlock(&kmmio_lock);
+
+	// Single-stepping was already disabled on entry to single_step_exception().
+	// regs->msr &= ~MSR_SE;
+	regs->msr |= ctx->saved_flags;
+	regs->softe = ctx->saved_softe;
+
+	/* These were acquired in kmmio_handler(). */
+	ctx->active--;
+	BUG_ON(ctx->active);
+	rcu_read_unlock_sched_notrace();
+
+	/*
+	 * If somebody else is single-stepping across a probe point, the MSR
+	 * will still have MSR_SE set, in which case continue the remaining
+	 * processing of the debug exception, as if this is not a probe hit.
+	 */
+	if (!(regs->msr & MSR_SE))
+		ret = 1;
+out:
+	return ret;
+}
+
+/* You must be holding kmmio_lock. */
+static int add_kmmio_fault_page(unsigned long addr)
+{
+	struct kmmio_fault_page *f;
+
+	f = get_kmmio_fault_page(addr);
+	if (f) {
+		if (!f->count)
+			arm_kmmio_fault_page(f);
+		f->count++;
+		return 0;
+	}
+
+	f = kzalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -1;
+
+	f->count = 1;
+	f->addr = addr;
+
+	if (arm_kmmio_fault_page(f)) {
+		kfree(f);
+		return -1;
+	}
+
+	list_add_rcu(&f->list, kmmio_page_list(f->addr));
+
+	return 0;
+}
+
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long addr,
+				struct kmmio_fault_page **release_list)
+{
+	struct kmmio_fault_page *f;
+
+	f = get_kmmio_fault_page(addr);
+	if (!f)
+		return;
+
+	f->count--;
+	BUG_ON(f->count < 0);
+	if (!f->count) {
+		disarm_kmmio_fault_page(f);
+		if (!f->scheduled_for_release) {
+			f->release_next = *release_list;
+			*release_list = f;
+			f->scheduled_for_release = true;
+		}
+	}
+}
+
+/*
+ * With page-unaligned ioremaps, one or two armed pages may contain
+ * addresses from outside the intended mapping. Events for these addresses
+ * are currently silently dropped. The events may result only from programming
+ * mistakes by accessing addresses before the beginning or past the end of a
+ * mapping.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long flags;
+	int ret = 0;
+	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
+	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+	unsigned int l;
+	pte_t *pte;
+
+	local_irq_save(flags);
+	arch_spin_lock(&kmmio_lock);
+	if (get_kmmio_probe(addr)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	pte = lookup_address(addr, &l);
+	if (!pte) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	kmmio_count++;
+	list_add_rcu(&p->list, &kmmio_probes);
+	while (size < size_lim) {
+		if (add_kmmio_fault_page(addr + size))
+			pr_err("Unable to set page fault.\n");
+		size += page_level_size(l);
+	}
+out:
+	arch_spin_unlock(&kmmio_lock);
+	local_irq_restore(flags);
+
+	/*
+	 * XXX: What should I do here?
+	 * Here was a call to global_flush_tlb(), but it does not exist
+	 * anymore. It seems it's not needed after all.
+	 */
+	return ret;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr = container_of(
+						head,
+						struct kmmio_delayed_release,
+						rcu);
+	struct kmmio_fault_page *f = dr->release_list;
+
+	while (f) {
+		struct kmmio_fault_page *next = f->release_next;
+
+		BUG_ON(f->count);
+		kfree(f);
+		f = next;
+	}
+	kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr =
+		container_of(head, struct kmmio_delayed_release, rcu);
+	struct kmmio_fault_page *f = dr->release_list;
+	struct kmmio_fault_page **prevp = &dr->release_list;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	arch_spin_lock(&kmmio_lock);
+	while (f) {
+		if (!f->count) {
+			list_del_rcu(&f->list);
+			prevp = &f->release_next;
+		} else {
+			*prevp = f->release_next;
+			f->release_next = NULL;
+			f->scheduled_for_release = false;
+		}
+		f = *prevp;
+	}
+	arch_spin_unlock(&kmmio_lock);
+	local_irq_restore(flags);
+
+	/* This is the real RCU destroy call. */
+	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ *    Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ *    Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ *    Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long flags;
+	unsigned long size = 0;
+	unsigned long addr = p->addr & PAGE_MASK;
+	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+	struct kmmio_fault_page *release_list = NULL;
+	struct kmmio_delayed_release *drelease;
+	unsigned int l;
+	pte_t *pte;
+
+	pte = lookup_address(addr, &l);
+	if (!pte)
+		return;
+
+	local_irq_save(flags);
+	arch_spin_lock(&kmmio_lock);
+	while (size < size_lim) {
+		release_kmmio_fault_page(addr + size, &release_list);
+		size += page_level_size(l);
+	}
+	list_del_rcu(&p->list);
+	kmmio_count--;
+	arch_spin_unlock(&kmmio_lock);
+	local_irq_restore(flags);
+
+	if (!release_list)
+		return;
+
+	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+	if (!drelease)
+		return;
+
+	drelease->release_list = release_list;
+
+	/*
+	 * This is not really RCU here. We have just disarmed a set of
+	 * pages so that they cannot trigger page faults anymore. However,
+	 * we cannot remove the pages from kmmio_page_table,
+	 * because a probe hit might be in flight on another CPU. The
+	 * pages are collected into a list, and they will be removed from
+	 * kmmio_page_table when it is certain that no probe hit related to
+	 * these pages can be in flight. RCU grace period sounds like a
+	 * good choice.
+	 *
+	 * If we removed the pages too early, kmmio page fault handler might
+	 * not find the respective kmmio_fault_page and determine it's not
+	 * a kmmio fault, when it actually is. This would lead to madness.
+	 */
+	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
+{
+	struct die_args *arg = args;
+
+	if (val == DIE_SSTEP && post_kmmio_handler(0, arg->regs) == 1)
+		return NOTIFY_STOP;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_die = {
+	.notifier_call = kmmio_die_notifier
+};
+
+int kmmio_init(void)
+{
+	int i;
+
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+		INIT_LIST_HEAD(&kmmio_page_table[i]);
+
+	return register_die_notifier(&nb_die);
+}
+
+void kmmio_cleanup(void)
+{
+	int i;
+
+	unregister_die_notifier(&nb_die);
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+			  pr_fmt("kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"));
+	}
+}
diff --git a/arch/powerpc/mm/mmio-mod.c b/arch/powerpc/mm/mmio-mod.c
new file mode 100644
index 000000000000..68ba9f028678
--- /dev/null
+++ b/arch/powerpc/mm/mmio-mod.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/x86/mm/mmio-mod.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@...ngroup.cn)
+ */
+
+#define pr_fmt(fmt) "mmiotrace: " fmt
+
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/mmiotrace.h>
+#include <linux/pgtable.h>
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include "pf_in.h"
+#include "mmiotrace_arch.h"
+
+struct remap_trace {
+	struct list_head list;
+	struct kmmio_probe probe;
+	resource_size_t phys;
+	unsigned long id;
+};
+
+/* Accessed per-cpu. */
+static DEFINE_PER_CPU(struct trap_reason, pf_reason);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
+
+static DEFINE_MUTEX(mmiotrace_mutex);
+static DEFINE_SPINLOCK(trace_lock);
+static atomic_t mmiotrace_enabled;
+static LIST_HEAD(trace_list);		/* struct remap_trace */
+
+/*
+ * Locking in this file:
+ * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
+ * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
+ *   and trace_lock.
+ * - Routines depending on is_enabled() must take trace_lock.
+ * - trace_list users must hold trace_lock.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
+ * - pre/post callbacks assume the effect of is_enabled() being true.
+ */
+
+/* module parameters */
+static unsigned long	filter_offset;
+static bool		nommiotrace;
+
+module_param(filter_offset, ulong, 0);
+module_param(nommiotrace, bool, 0);
+
+MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
+MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
+
+static bool is_enabled(void)
+{
+	return atomic_read(&mmiotrace_enabled);
+}
+
+static void print_pte(unsigned long address)
+{
+	unsigned int level;
+	pte_t *pte = lookup_address(address, &level);
+
+	if (!pte) {
+		pr_err("Error in %s: no pte for page 0x%08lx\n",
+		       __func__, address);
+		return;
+	}
+
+	if (level == PMD_SHIFT) {
+		pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
+			 address);
+		BUG();
+	}
+	pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
+		address,
+		(unsigned long long)pte_val(*pte),
+		(unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
+}
+
+/*
+ * For some reason the pre/post pairs have been called in an
+ * unmatched order. Report and die.
+ */
+static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
+{
+	const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+
+	pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
+		 addr, my_reason->addr);
+	print_pte(addr);
+	pr_emerg("faulting IP is at %pS\n", (void *)regs->nip);
+	pr_emerg("last faulting IP was at %pS\n", (void *)my_reason->ip);
+	put_cpu_var(pf_reason);
+	BUG();
+}
+
+static void pre(struct kmmio_probe *p, struct pt_regs *regs,
+						unsigned long addr)
+{
+	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+	struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+	const unsigned long instptr = instruction_pointer(regs);
+	struct opcode_t *opcode = get_opcode((unsigned int *)instptr);
+	enum mm_io_opcode type = get_ins_type(opcode);
+	struct remap_trace *trace = p->private;
+
+	/* it doesn't make sense to have more than one active trace per cpu */
+	if (my_reason->active_traces)
+		die_kmmio_nesting_error(regs, addr);
+	else
+		my_reason->active_traces++;
+
+	if (!opcode) {
+		pr_warn("The ins may be not included in src. Tell the dever follow info:");
+		pr_warn("ins_addr: 0x%lx    ins: 0x%lx", instptr, *(unsigned long *)instptr);
+	}
+
+	my_reason->opcode = opcode;
+
+	my_reason->addr = addr;
+	my_reason->ip = instptr;
+
+	my_trace->phys = addr - trace->probe.addr + trace->phys;
+	my_trace->map_id = trace->id;
+
+	my_trace->pc = instptr;
+
+	my_trace->opcode = type;
+	my_trace->width = get_ins_width(opcode);
+
+	if (type == MMIO_WRITE)
+		my_trace->value = get_ins_val(my_reason, regs);
+
+	put_cpu_var(cpu_trace);
+	put_cpu_var(pf_reason);
+}
+
+static void post(struct kmmio_probe *p, unsigned long condition,
+							struct pt_regs *regs)
+{
+	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+	struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+	struct opcode_t *opcode = my_reason->opcode;
+	enum mm_io_opcode type = get_ins_type(opcode);
+
+	/* this should always return the active_trace count to 0 */
+	my_reason->active_traces--;
+	if (my_reason->active_traces) {
+		pr_emerg("unexpected post handler");
+		BUG();
+	}
+
+	if (type == MMIO_READ)
+		my_trace->value = get_ins_val(my_reason, regs);
+
+	mmio_trace_rw(my_trace);
+	put_cpu_var(cpu_trace);
+	put_cpu_var(pf_reason);
+}
+
+static void ioremap_trace_core(resource_size_t offset, unsigned long size,
+							void __iomem *addr)
+{
+	static atomic_t next_id;
+	struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+	/* These are page-unaligned. */
+	struct mmiotrace_map map = {
+		.phys = offset,
+		.virt = (unsigned long)addr,
+		.len = size,
+		.opcode = MMIO_PROBE
+	};
+
+	if (!trace) {
+		pr_err("kmalloc failed in ioremap\n");
+		return;
+	}
+
+	*trace = (struct remap_trace) {
+		.probe = {
+			.addr = (unsigned long)addr,
+			.len = size,
+			.pre_handler = pre,
+			.post_handler = post,
+			.private = trace
+		},
+		.phys = offset,
+		.id = atomic_inc_return(&next_id)
+	};
+	map.map_id = trace->id;
+
+	spin_lock_irq(&trace_lock);
+	if (!is_enabled()) {
+		kfree(trace);
+		goto not_enabled;
+	}
+
+	mmio_trace_mapping(&map);
+	list_add_tail(&trace->list, &trace_list);
+	if (!nommiotrace)
+		register_kmmio_probe(&trace->probe);
+
+not_enabled:
+	spin_unlock_irq(&trace_lock);
+}
+
+void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+						void __iomem *addr)
+{
+	pr_err("ioremap_*(0x%llx, 0x%lx) = %p\n",
+		 (unsigned long long)offset, size, addr);
+	if (!is_enabled()) /* recheck and proper locking in *_core() */
+		return;
+
+	pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
+		 (unsigned long long)offset, size, addr);
+	if ((filter_offset) && (offset != filter_offset))
+		return;
+	ioremap_trace_core(offset, size, addr);
+}
+
+static void iounmap_trace_core(volatile void __iomem *addr)
+{
+	struct mmiotrace_map map = {
+		.phys = 0,
+		.virt = (unsigned long)addr,
+		.len = 0,
+		.opcode = MMIO_UNPROBE
+	};
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+	struct remap_trace *found_trace = NULL;
+
+	pr_debug("Unmapping %p.\n", addr);
+
+	spin_lock_irq(&trace_lock);
+	if (!is_enabled())
+		goto not_enabled;
+
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		if ((unsigned long)addr == trace->probe.addr) {
+			if (!nommiotrace)
+				unregister_kmmio_probe(&trace->probe);
+			list_del(&trace->list);
+			found_trace = trace;
+			break;
+		}
+	}
+	map.map_id = (found_trace) ? found_trace->id : -1;
+	mmio_trace_mapping(&map);
+
+not_enabled:
+	spin_unlock_irq(&trace_lock);
+	if (found_trace) {
+		synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+		kfree(found_trace);
+	}
+}
+
+void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+	might_sleep();
+	if (is_enabled()) /* recheck and proper locking in *_core() */
+		iounmap_trace_core(addr);
+}
+
+int mmiotrace_printk(const char *fmt, ...)
+{
+	int ret = 0;
+	va_list args;
+	unsigned long flags;
+
+	va_start(args, fmt);
+
+	spin_lock_irqsave(&trace_lock, flags);
+	if (is_enabled())
+		ret = mmio_trace_printk(fmt, args);
+	spin_unlock_irqrestore(&trace_lock, flags);
+
+	va_end(args);
+	return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
+static void clear_trace_list(void)
+{
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+
+	/*
+	 * No locking required, because the caller ensures we are in a
+	 * critical section via mutex, and is_enabled() is false,
+	 * i.e. nothing can traverse or modify this list.
+	 * Caller also ensures is_enabled() cannot change.
+	 */
+	list_for_each_entry(trace, &trace_list, list) {
+		pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
+			  trace->probe.addr, trace->probe.len);
+		if (!nommiotrace)
+			unregister_kmmio_probe(&trace->probe);
+	}
+	synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		list_del(&trace->list);
+		kfree(trace);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static cpumask_var_t downed_cpus;
+
+static void enter_uniprocessor(void)
+{
+	int cpu;
+	int err;
+
+	if (!cpumask_available(downed_cpus) &&
+	    !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
+		pr_notice("Failed to allocate mask\n");
+		goto out;
+	}
+
+	cpus_read_lock();
+	cpumask_copy(downed_cpus, cpu_online_mask);
+	cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
+	if (num_online_cpus() > 1)
+		pr_notice("Disabling non-boot CPUs...\n");
+	cpus_read_unlock();
+
+	for_each_cpu(cpu, downed_cpus) {
+		err = remove_cpu(cpu);
+		if (!err)
+			pr_info("CPU%d is down.\n", cpu);
+		else
+			pr_err("Error taking CPU%d down: %d\n", cpu, err);
+	}
+out:
+	if (num_online_cpus() > 1)
+		pr_warn("multiple CPUs still online, may miss events.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+	int cpu;
+	int err;
+
+	if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus))
+		return;
+	pr_notice("Re-enabling CPUs...\n");
+	for_each_cpu(cpu, downed_cpus) {
+		err = add_cpu(cpu);
+		if (!err)
+			pr_info("enabled CPU%d.\n", cpu);
+		else
+			pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
+	}
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static void enter_uniprocessor(void)
+{
+	if (num_online_cpus() > 1)
+		pr_warn("multiple CPUs are online, may miss events. Suggest booting with maxcpus=1 kernel argument.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+}
+#endif
+
+void enable_mmiotrace(void)
+{
+	mutex_lock(&mmiotrace_mutex);
+	if (is_enabled())
+		goto out;
+
+	if (nommiotrace)
+		pr_info("MMIO tracing disabled.\n");
+	kmmio_init();
+	enter_uniprocessor();
+	spin_lock_irq(&trace_lock);
+	atomic_inc(&mmiotrace_enabled);
+	spin_unlock_irq(&trace_lock);
+	pr_info("enabled.\n");
+out:
+	mutex_unlock(&mmiotrace_mutex);
+}
+
+void disable_mmiotrace(void)
+{
+	mutex_lock(&mmiotrace_mutex);
+	if (!is_enabled())
+		goto out;
+
+	spin_lock_irq(&trace_lock);
+	atomic_dec(&mmiotrace_enabled);
+	BUG_ON(is_enabled());
+	spin_unlock_irq(&trace_lock);
+
+	clear_trace_list(); /* guarantees: no more kmmio callbacks */
+	leave_uniprocessor();
+	kmmio_cleanup();
+	pr_info("disabled.\n");
+out:
+	mutex_unlock(&mmiotrace_mutex);
+}
diff --git a/arch/powerpc/mm/mmiotrace_arch.c b/arch/powerpc/mm/mmiotrace_arch.c
new file mode 100644
index 000000000000..ccc8032384ef
--- /dev/null
+++ b/arch/powerpc/mm/mmiotrace_arch.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/powerpc/mm/pgtable.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@...ngroup.cn)
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/hugetlb.h>
+
+#include "mmiotrace_arch.h"
+
+static pte_t *mmiotrace_find_linux_pte(pgd_t *pgdp, unsigned long ea,
+			bool *is_thp, unsigned int *hpage_shift)
+{
+	p4d_t p4d, *p4dp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t *ret_pte;
+	hugepd_t *hpdp = NULL;
+	unsigned int pdshift;
+
+	if (hpage_shift)
+		*hpage_shift = 0;
+
+	if (is_thp)
+		*is_thp = false;
+
+	/*
+	 * Always operate on the local stack value. This makes sure the
+	 * value doesn't get updated by a parallel THP split/collapse,
+	 * page fault or page unmap. The returned pte_t * is still not
+	 * stable, so it must be checked by the caller for the above
+	 * conditions. The top level is an exception because it is folded
+	 * into the p4d.
+	 */
+	p4dp = p4d_offset(pgdp, ea);
+	p4d  = READ_ONCE(*p4dp);
+	pdshift = P4D_SHIFT;
+
+	if (p4d_none(p4d))
+		return NULL;
+
+	if (p4d_leaf(p4d)) {
+		ret_pte = (pte_t *)p4dp;
+		goto out;
+	}
+
+	if (is_hugepd(__hugepd(p4d_val(p4d)))) {
+		hpdp = (hugepd_t *)&p4d;
+		goto out_huge;
+	}
+
+	/*
+	 * Even if we end up with an unmap, the pgtable will not
+	 * be freed, because we do an RCU free and interrupts are
+	 * disabled here.
+	 */
+	pdshift = PUD_SHIFT;
+	pudp = pud_offset(&p4d, ea);
+	pud  = READ_ONCE(*pudp);
+
+	if (pud_none(pud))
+		return NULL;
+
+	if (pud_leaf(pud)) {
+		ret_pte = (pte_t *)pudp;
+		goto out;
+	}
+
+	if (is_hugepd(__hugepd(pud_val(pud)))) {
+		hpdp = (hugepd_t *)&pud;
+		goto out_huge;
+	}
+
+	pdshift = PMD_SHIFT;
+	pmdp = pmd_offset(&pud, ea);
+	pmd  = READ_ONCE(*pmdp);
+
+	/*
+	 * A hugepage collapse is captured by this condition, see
+	 * pmdp_collapse_flush.
+	 */
+	if (pmd_none(pmd))
+		return NULL;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * A hugepage split is captured by this condition, see
+	 * pmdp_invalidate.
+	 *
+	 * Huge page modification can be caught here too.
+	 */
+	if (pmd_is_serializing(pmd))
+		return NULL;
+#endif
+
+	if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
+		if (is_thp)
+			*is_thp = true;
+		ret_pte = (pte_t *)pmdp;
+		goto out;
+	}
+
+	if (pmd_leaf(pmd)) {
+		ret_pte = (pte_t *)pmdp;
+		goto out;
+	}
+
+	if (is_hugepd(__hugepd(pmd_val(pmd)))) {
+		hpdp = (hugepd_t *)&pmd;
+		goto out_huge;
+	}
+
+	pdshift = PAGE_SHIFT;
+
+	if (hpage_shift)
+		*hpage_shift = pdshift;
+
+	return pte_offset_kernel(&pmd, ea);
+
+out_huge:
+	if (!hpdp)
+		return NULL;
+
+	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
+	pdshift = hugepd_shift(*hpdp);
+out:
+	if (hpage_shift)
+		*hpage_shift = pdshift;
+	return ret_pte;
+}
+
+pte_t *lookup_address(unsigned long address, unsigned int *shift)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pte_t *pte = mmiotrace_find_linux_pte(pgd_offset_k(address), address, NULL, shift);
+
+	local_irq_restore(flags);
+
+	return pte;
+}
diff --git a/arch/powerpc/mm/mmiotrace_arch.h b/arch/powerpc/mm/mmiotrace_arch.h
new file mode 100644
index 000000000000..f4a5bff24a07
--- /dev/null
+++ b/arch/powerpc/mm/mmiotrace_arch.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Derived from arch/powerpc/mm/pgtable.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@...ngroup.cn)
+ */
+
+#ifndef __MMIOTRACE_ARCH_
+#define __MMIOTRACE_ARCH_
+#include <asm/pgtable.h>
+
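+/*
+ * Unlike x86, the "level" passed around here is the page shift itself
+ * (e.g. PAGE_SHIFT for normal pages, PMD_SHIFT for huge pages), as
+ * reported by lookup_address(), so size and mask derive directly from it.
+ */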
+static inline int page_level_shift(unsigned int level)
+{
+	return level;
+}
+static inline unsigned long page_level_size(unsigned int level)
+{
+	return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(unsigned int level)
+{
+	return ~(page_level_size(level) - 1);
+}
+
+pte_t *lookup_address(unsigned long address, unsigned int *level);
+#endif // __MMIOTRACE_ARCH_
diff --git a/arch/powerpc/mm/pf_in.c b/arch/powerpc/mm/pf_in.c
new file mode 100644
index 000000000000..e6c90b383e7f
--- /dev/null
+++ b/arch/powerpc/mm/pf_in.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Derived from arch/x86/mm/pf_in.c:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@...ngroup.cn)
+ */
+
+#include <linux/ptrace.h> /* struct pt_regs */
+#include "pf_in.h"
+#include <linux/printk.h>
+#include <linux/mmiotrace.h>
+
+/* D 32 0x80000000 B lwz Load Word and Zero */
+/* D 33 0x84000000 B lwzu Load Word and Zero with Update */
+/* D 34 0x88000000 B lbz Load Byte and Zero */
+/* D 35 0x8C000000 B lbzu Load Byte and Zero with Update */
+/* D 36 0x90000000 B stw Store Word */
+/* D 37 0x94000000 B stwu Store Word with Update */
+/* D 38 0x98000000 B stb Store Byte */
+/* D 39 0x9C000000 B stbu Store Byte with Update */
+/* D 40 0xA0000000 B lhz Load Halfword and Zero */
+/* D 41 0xA4000000 B lhzu Load Halfword and Zero with Update */
+/* D 42 0xA8000000 B lha Load Halfword Algebraic */
+/* D 43 0xAC000000 B lhau Load Halfword Algebraic with Update */
+/* D 44 0xB0000000 B sth Store Halfword */
+/* D 45 0xB4000000 B sthu Store Halfword with Update */
+/* D 46 0xB8000000 B lmw Load Multiple Word */
+/* D 47 0xBC000000 B stmw Store Multiple Word */
+/* D 48 0xC0000000 FP lfs Load Floating-Point Single */
+/* D 49 0xC4000000 FP lfsu Load Floating-Point Single with Update */
+/* D 50 0xC8000000 FP lfd Load Floating-Point Double */
+/* D 51 0xCC000000 FP lfdu Load Floating-Point Double with Update */
+/* D 52 0xD0000000 FP stfs Store Floating-Point Single */
+/* D 53 0xD4000000 FP stfsu Store Floating-Point Single with Update */
+/* D 54 0xD8000000 FP stfd Store Floating-Point Double */
+/* D 55 0xDC000000 FP stfdu Store Floating-Point Double with Update */
+/* DQ 56 0xE0000000 P 58 LSQ lq Load Quadword */
+/* DS 57 0xE4000000 140 FP.out Lfdp Load Floating-Point Double Pair */
+/* DS 58 0xE8000000 53 64 ld Load Doubleword */
+/* DS 58 0xE8000001 53 64 ldu Load Doubleword with Update */
+/* DS 58 0xE8000002 52 64 Lwa Load Word Algebraic */
+/* DS 62 0xF8000000 57 64 std Store Doubleword */
+/* DS 62 0xF8000001 57 64 stdu Store Doubleword with Update */
+/* DS 62 0xF8000002 59 LSQ stq Store Quadword */
+
+// D-form:
+// 0-5    6-10    11-15   16-31
+// opcode RT      RA      Offset
+
+// DQ-form:
+// 0-5    6-10  11-15  16-27
+// opcode RT    RA     Offset
+
+// DS-form:
+// 0-5    6-10  11-15  16-29  30-31
+// opcode RT    RA     Offset opcode
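+
+// Example: 0xE9230010 is "ld r9, 16(r3)" (DS-form): the primary opcode in
+// bits 0-5 is 58 (0xE8000000), RT (bits 6-10) is 9, RA (bits 11-15) is 3,
+// the DS field (bits 16-29) holds 16 >> 2 = 4, and XO (bits 30-31) is 0.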
+
+#define D_OPCODE_MASK GENMASK(31, 26)
+#define DQ_OPCODE_MASK D_OPCODE_MASK
+#define DS_OPCODE_MASK (D_OPCODE_MASK | GENMASK(1, 0))
+#define RS_RT_OFFSET 21UL
+#define RS_RT_MASK GENMASK(25, 21)
+#define RA_MASK GENMASK(20, 16)
+#define D_OFFSET GENMASK(15, 0)
+#define DQ_OFFSET GENMASK(15, 4)
+#define DS_OFFSET GENMASK(15, 2)
+
+struct opcode_t opcodes[] = {
+	{0x80000000, D_FORMAT, "lwz", },
+	{0x84000000, D_FORMAT, "lwzu", },
+	{0x88000000, D_FORMAT, "lbz", },
+	{0x8C000000, D_FORMAT, "lbzu", },
+	{0x90000000, D_FORMAT, "stw", },
+	{0x94000000, D_FORMAT, "stwu", },
+	{0x98000000, D_FORMAT, "stb", },
+	{0x9C000000, D_FORMAT, "stbu", },
+	{0xA0000000, D_FORMAT, "lhz", },
+	{0xA4000000, D_FORMAT, "lhzu", },
+	{0xA8000000, D_FORMAT, "lha", },
+	{0xAC000000, D_FORMAT, "lhau", },
+	{0xB0000000, D_FORMAT, "sth", },
+	{0xB4000000, D_FORMAT, "sthu", },
+	{0xB8000000, D_FORMAT, "lmw", },
+	{0xBC000000, D_FORMAT, "stmw", },
+	{0xC0000000, D_FORMAT, "lfs", },
+	{0xC4000000, D_FORMAT, "lfsu", },
+	{0xC8000000, D_FORMAT, "lfd", },
+	{0xCC000000, D_FORMAT, "lfdu", },
+	{0xD0000000, D_FORMAT, "stfs", },
+	{0xD4000000, D_FORMAT, "stfsu", },
+	{0xD8000000, D_FORMAT, "stfd", },
+	{0xDC000000, D_FORMAT, "stfdu", },
+	{0xE0000000, DQ_FORMAT, "lq", },
+	{0xE4000000, DS_FORMAT, "lfdp", },
+	{0xE8000000, DS_FORMAT, "ld", },
+	{0xE8000001, DS_FORMAT, "ldu", },
+	{0xE8000002, DS_FORMAT, "lwa", },
+	{0xF8000000, DS_FORMAT, "std", },
+	{0xF8000001, DS_FORMAT, "stdu", },
+	{0xF8000002, DS_FORMAT, "stq", }
+};
+
+struct opcode_t *get_opcode(unsigned int *addr)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(opcodes); i++) {
+		switch (opcodes[i].form) {
+		case D_FORMAT:
+			if (opcodes[i].opcode == (*addr & D_OPCODE_MASK))
+				return &opcodes[i];
+			break;
+		case DQ_FORMAT:
+			if (opcodes[i].opcode == (*addr & DQ_OPCODE_MASK))
+				return &opcodes[i];
+			break;
+		case DS_FORMAT:
+			if (opcodes[i].opcode == (*addr & DS_OPCODE_MASK))
+				return &opcodes[i];
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+inline enum mm_io_opcode get_ins_type(struct opcode_t *opcode)
+{
+	if (!opcode)
+		return MMIO_UNKNOWN_OP;
+
+	if (opcode->name[0] == 'l')
+		return MMIO_READ;
+
+	if (opcode->name[0] == 's')
+		return MMIO_WRITE;
+
+	return MMIO_UNKNOWN_OP;
+}
+
+unsigned int get_ins_width(struct opcode_t *opcode)
+{
+	char width_ch = 0;
+
+	if (!opcode)
+		return 0;
+
+	if (opcode->name[0] == 'l')
+		width_ch = opcode->name[1];
+
+	if (opcode->name[0] == 's')
+		width_ch = opcode->name[2];
+
+	switch (width_ch) {
+	case 'b': /* byte */
+		return 1;
+	case 'h': /* half word */
+		return sizeof(long) / 2;
+	case 'w': /* word */
+		/* return sizeof(long); */
+	case 'm': /* multiple words (could be computed as (32 - RT) * sizeof(long)) */
+	case 'f': /* floating point (word count not tracked here) */
+	case 'd': /* double words */
+		/* return 2 * sizeof(long); */
+	case 'q': /* quad words */
+		/* return 4 * sizeof(long); */
+	default:
+		return sizeof(long);
+	}
+}
+
+unsigned long get_ins_val(struct trap_reason *reason, struct pt_regs *regs)
+{
+	struct opcode_t *opcode = reason->opcode;
+	unsigned int ins = *(unsigned int *)(reason->ip);
+	unsigned int reg_no;
+	unsigned long mask = ~0UL;
+
+	if (!opcode)
+		return 0;
+
+	mask >>= 8 * (sizeof(long) - get_ins_width(opcode));
+	reg_no = (ins & RS_RT_MASK) >> RS_RT_OFFSET;
+
+	return regs->gpr[reg_no] & mask;
+}
diff --git a/arch/powerpc/mm/pf_in.h b/arch/powerpc/mm/pf_in.h
new file mode 100644
index 000000000000..905ba4937137
--- /dev/null
+++ b/arch/powerpc/mm/pf_in.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Derived from arch/x86/mm/pf_in.h:
+ *   Copyright (C) 2024 Jialong Yang (jialong.yang@...ngroup.cn)
+ */
+
+#ifndef __PF_H_
+#define __PF_H_
+
+enum OPCODE_FORMAT {
+	D_FORMAT,
+	DQ_FORMAT,
+	DS_FORMAT,
+};
+
+struct opcode_t {
+	unsigned int opcode;
+	enum OPCODE_FORMAT form;
+	const char *name;
+};
+
+struct trap_reason {
+	unsigned long addr;
+	unsigned long ip;
+	struct opcode_t *opcode;
+	int active_traces;
+};
+
+struct opcode_t *get_opcode(unsigned int *addr);
+enum mm_io_opcode get_ins_type(struct opcode_t *opcode);
+unsigned int get_ins_width(struct opcode_t *opcode);
+unsigned long get_ins_val(struct trap_reason *reason, struct pt_regs *regs);
+#endif /* __PF_H_ */
-- 
2.34.1

