[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <86bjkyrly9.wl-maz@kernel.org>
Date: Wed, 19 Nov 2025 17:06:38 +0000
From: Marc Zyngier <maz@...nel.org>
To: Vincent Donnefort <vdonnefort@...gle.com>
Cc: rostedt@...dmis.org,
mhiramat@...nel.org,
mathieu.desnoyers@...icios.com,
linux-trace-kernel@...r.kernel.org,
oliver.upton@...ux.dev,
joey.gouly@....com,
suzuki.poulose@....com,
yuzenghui@...wei.com,
kvmarm@...ts.linux.dev,
linux-arm-kernel@...ts.infradead.org,
jstultz@...gle.com,
qperret@...gle.com,
will@...nel.org,
aneesh.kumar@...nel.org,
kernel-team@...roid.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v8 21/28] KVM: arm64: Add tracing capability for the pKVM hyp
On Fri, 07 Nov 2025 09:38:33 +0000,
Vincent Donnefort <vdonnefort@...gle.com> wrote:
>
> When running with protected mode, the host has very little knowledge
> about what is happening in the hypervisor. Of course this is an
> essential feature for security but nonetheless, as that piece of code
> grows with more responsibilities, we now need a way to debug and
> profile it. Tracefs, with its reliability, versatility and support for
> user-space, is the perfect tool.
>
> There's no way the hypervisor could log events directly into the host
> tracefs ring-buffers. So instead let's use our own, where the hypervisor
> is the writer and the host the reader.
>
> Signed-off-by: Vincent Donnefort <vdonnefort@...gle.com>
>
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index 9da54d4ee49e..ad02dee140d3 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -89,6 +89,10 @@ enum __kvm_host_smccc_func {
> __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load,
> __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put,
> __KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
> + __KVM_HOST_SMCCC_FUNC___pkvm_load_tracing,
> + __KVM_HOST_SMCCC_FUNC___pkvm_unload_tracing,
> + __KVM_HOST_SMCCC_FUNC___pkvm_enable_tracing,
> + __KVM_HOST_SMCCC_FUNC___pkvm_swap_reader_tracing,
> };
>
> #define DECLARE_KVM_VHE_SYM(sym) extern char sym[]
> diff --git a/arch/arm64/include/asm/kvm_hyptrace.h b/arch/arm64/include/asm/kvm_hyptrace.h
> new file mode 100644
> index 000000000000..9c30a479bc36
> --- /dev/null
> +++ b/arch/arm64/include/asm/kvm_hyptrace.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __ARM64_KVM_HYPTRACE_H_
> +#define __ARM64_KVM_HYPTRACE_H_
> +
> +#include <linux/ring_buffer.h>
> +
> +struct hyp_trace_desc {
> + unsigned long bpages_backing_start;
Why is this an integer type? You keep casting it all over the place,
which tells me that's not the ideal type.
> + size_t bpages_backing_size;
> + struct trace_buffer_desc trace_buffer_desc;
> +
> +};
> +#endif
> diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
> index 4f803fd1c99a..580426cdbe77 100644
> --- a/arch/arm64/kvm/Kconfig
> +++ b/arch/arm64/kvm/Kconfig
> @@ -83,4 +83,11 @@ config PTDUMP_STAGE2_DEBUGFS
>
> If in doubt, say N.
>
> +config PKVM_TRACING
> + bool
> + depends on KVM
> + depends on TRACING
> + select SIMPLE_RING_BUFFER
> + default y
I'd rather this is made to depend on NVHE_EL2_DEBUG, just like the
other debug options.
> +
> endif # VIRTUALIZATION
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
> new file mode 100644
> index 000000000000..996e90c0974f
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
> @@ -0,0 +1,23 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __ARM64_KVM_HYP_NVHE_TRACE_H
> +#define __ARM64_KVM_HYP_NVHE_TRACE_H
> +#include <asm/kvm_hyptrace.h>
> +
> +#ifdef CONFIG_PKVM_TRACING
> +void *tracing_reserve_entry(unsigned long length);
> +void tracing_commit_entry(void);
> +
> +int __pkvm_load_tracing(unsigned long desc_va, size_t desc_size);
> +void __pkvm_unload_tracing(void);
> +int __pkvm_enable_tracing(bool enable);
> +int __pkvm_swap_reader_tracing(unsigned int cpu);
> +#else
> +static inline void *tracing_reserve_entry(unsigned long length) { return NULL; }
> +static inline void tracing_commit_entry(void) { }
> +
> +static inline int __pkvm_load_tracing(unsigned long desc_va, size_t desc_size) { return -ENODEV; }
> +static inline void __pkvm_unload_tracing(void) { }
> +static inline int __pkvm_enable_tracing(bool enable) { return -ENODEV; }
> +static inline int __pkvm_swap_reader_tracing(unsigned int cpu) { return -ENODEV; }
> +#endif
> +#endif
> diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> index f55a9a17d38f..504c3b9caef8 100644
> --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> @@ -29,7 +29,7 @@ hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
> ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
> hyp-obj-y += ../../../kernel/smccc-call.o
> hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
> -hyp-obj-$(CONFIG_PKVM_TRACING) += clock.o
> +hyp-obj-$(CONFIG_PKVM_TRACING) += clock.o trace.o ../../../../../kernel/trace/simple_ring_buffer.o
Can we get something less awful here? Surely there is a way to get an
absolute path from the kbuild infrastructure? $(objtree) springs to
mind...
> hyp-obj-y += $(lib-objs)
>
> ##
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> index 29430c031095..6381e50ff531 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> @@ -18,6 +18,7 @@
> #include <nvhe/mem_protect.h>
> #include <nvhe/mm.h>
> #include <nvhe/pkvm.h>
> +#include <nvhe/trace.h>
> #include <nvhe/trap_handler.h>
>
> DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
> @@ -585,6 +586,35 @@ static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
> cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
> }
>
> +static void handle___pkvm_load_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> + DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
> + DECLARE_REG(size_t, desc_size, host_ctxt, 2);
> +
> + cpu_reg(host_ctxt, 1) = __pkvm_load_tracing(desc_hva, desc_size);
> +}
> +
> +static void handle___pkvm_unload_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> + __pkvm_unload_tracing();
> +
> + cpu_reg(host_ctxt, 1) = 0;
> +}
> +
> +static void handle___pkvm_enable_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> + DECLARE_REG(bool, enable, host_ctxt, 1);
> +
> + cpu_reg(host_ctxt, 1) = __pkvm_enable_tracing(enable);
> +}
> +
> +static void handle___pkvm_swap_reader_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> + DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
> +
> + cpu_reg(host_ctxt, 1) = __pkvm_swap_reader_tracing(cpu);
> +}
> +
> typedef void (*hcall_t)(struct kvm_cpu_context *);
>
> #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
> @@ -626,6 +656,10 @@ static const hcall_t host_hcall[] = {
> HANDLE_FUNC(__pkvm_vcpu_load),
> HANDLE_FUNC(__pkvm_vcpu_put),
> HANDLE_FUNC(__pkvm_tlb_flush_vmid),
> + HANDLE_FUNC(__pkvm_load_tracing),
> + HANDLE_FUNC(__pkvm_unload_tracing),
> + HANDLE_FUNC(__pkvm_enable_tracing),
> + HANDLE_FUNC(__pkvm_swap_reader_tracing),
> };
>
> static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
> diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c
> new file mode 100644
> index 000000000000..def5cbc75722
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/nvhe/trace.c
> @@ -0,0 +1,257 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2025 Google LLC
> + * Author: Vincent Donnefort <vdonnefort@...gle.com>
> + */
> +
> +#include <nvhe/clock.h>
> +#include <nvhe/mem_protect.h>
> +#include <nvhe/mm.h>
> +#include <nvhe/trace.h>
> +
> +#include <asm/percpu.h>
> +#include <asm/kvm_mmu.h>
> +#include <asm/local.h>
> +
> +#include <linux/simple_ring_buffer.h>
> +
> +static DEFINE_PER_CPU(struct simple_rb_per_cpu, __simple_rbs);
> +
> +static struct hyp_trace_buffer {
> + struct simple_rb_per_cpu __percpu *simple_rbs;
> + unsigned long bpages_backing_start;
> + size_t bpages_backing_size;
> + hyp_spinlock_t lock;
> +} trace_buffer = {
> + .simple_rbs = &__simple_rbs,
> + .lock = __HYP_SPIN_LOCK_UNLOCKED,
> +};
> +
> +static bool hyp_trace_buffer_loaded(struct hyp_trace_buffer *trace_buffer)
> +{
> + return trace_buffer->bpages_backing_size > 0;
> +}
> +
> +void *tracing_reserve_entry(unsigned long length)
> +{
> + return simple_ring_buffer_reserve(this_cpu_ptr(trace_buffer.simple_rbs), length,
> + trace_clock());
> +}
> +
> +void tracing_commit_entry(void)
> +{
> + simple_ring_buffer_commit(this_cpu_ptr(trace_buffer.simple_rbs));
> +}
> +
> +static int hyp_trace_buffer_load_bpage_backing(struct hyp_trace_buffer *trace_buffer,
> + struct hyp_trace_desc *desc)
> +{
> + unsigned long start = kern_hyp_va(desc->bpages_backing_start);
> + size_t size = desc->bpages_backing_size;
> + int ret;
> +
> + if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size))
> + return -EINVAL;
> +
> + ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn((void *)start), size >> PAGE_SHIFT);
> + if (ret)
> + return ret;
> +
> + memset((void *)start, 0, size);
> +
> + trace_buffer->bpages_backing_start = start;
> + trace_buffer->bpages_backing_size = size;
> +
> + return 0;
> +}
> +
> +static void hyp_trace_buffer_unload_bpage_backing(struct hyp_trace_buffer *trace_buffer)
> +{
> + unsigned long start = trace_buffer->bpages_backing_start;
> + size_t size = trace_buffer->bpages_backing_size;
> +
> + if (!size)
> + return;
> +
> + memset((void *)start, 0, size);
> +
> + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT));
> +
> + trace_buffer->bpages_backing_start = 0;
> + trace_buffer->bpages_backing_size = 0;
> +}
> +
> +static void *__pin_shared_page(unsigned long kern_va)
> +{
> + void *va = kern_hyp_va((void *)kern_va);
> +
> + return hyp_pin_shared_mem(va, va + PAGE_SIZE) ? NULL : va;
> +}
> +
> +static void __unpin_shared_page(void *va)
> +{
> + hyp_unpin_shared_mem(va, va + PAGE_SIZE);
> +}
> +
> +static void hyp_trace_buffer_unload(struct hyp_trace_buffer *trace_buffer)
> +{
> + int cpu;
> +
> + hyp_assert_lock_held(&trace_buffer->lock);
> +
> + if (!hyp_trace_buffer_loaded(trace_buffer))
> + return;
> +
> + for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
> + __simple_ring_buffer_unload(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
> + __unpin_shared_page);
> +
> + hyp_trace_buffer_unload_bpage_backing(trace_buffer);
> +}
> +
> +static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer,
> + struct hyp_trace_desc *desc)
> +{
> + struct simple_buffer_page *bpages;
> + struct ring_buffer_desc *rb_desc;
> + int ret, cpu;
> +
> + hyp_assert_lock_held(&trace_buffer->lock);
> +
> + if (hyp_trace_buffer_loaded(trace_buffer))
> + return -EINVAL;
> +
> + ret = hyp_trace_buffer_load_bpage_backing(trace_buffer, desc);
> + if (ret)
> + return ret;
> +
> + bpages = (struct simple_buffer_page *)trace_buffer->bpages_backing_start;
> + for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
> + ret = __simple_ring_buffer_init(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
> + bpages, rb_desc, __pin_shared_page,
> + __unpin_shared_page);
> + if (ret)
> + break;
> +
> + bpages += rb_desc->nr_page_va;
> + }
> +
> + if (ret)
> + hyp_trace_buffer_unload(trace_buffer);
> +
> + return ret;
> +}
> +
> +static bool hyp_trace_desc_validate(struct hyp_trace_desc *desc, size_t desc_size)
> +{
> + struct simple_buffer_page *bpages = (struct simple_buffer_page *)desc->bpages_backing_start;
> + struct ring_buffer_desc *rb_desc;
> + void *bpages_end, *desc_end;
> + unsigned int cpu;
> +
> + desc_end = (void *)desc + desc_size; /* __pkvm_host_donate_hyp validates desc_size */
> +
> + bpages_end = (void *)desc->bpages_backing_start + desc->bpages_backing_size;
> + if (bpages_end < (void *)desc->bpages_backing_start)
> + return false;
> +
> + for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
> + /* Can we read nr_page_va? */
> + if ((void *)rb_desc + struct_size(rb_desc, page_va, 0) > desc_end)
> + return false;
> +
> + /* Overflow desc? */
> + if ((void *)rb_desc + struct_size(rb_desc, page_va, rb_desc->nr_page_va) > desc_end)
> + return false;
> +
> + /* Overflow bpages backing memory? */
> + if ((void *)(bpages + rb_desc->nr_page_va) > bpages_end)
> + return false;
> +
> + if (cpu >= hyp_nr_cpus)
> + return false;
> +
> + if (cpu != rb_desc->cpu)
> + return false;
> +
> + bpages += rb_desc->nr_page_va;
> + }
> +
> + return true;
> +}
> +
> +int __pkvm_load_tracing(unsigned long desc_hva, size_t desc_size)
> +{
> + struct hyp_trace_desc *desc = (struct hyp_trace_desc *)kern_hyp_va(desc_hva);
> + int ret;
> +
> + if (!desc_size || !PAGE_ALIGNED(desc_hva) || !PAGE_ALIGNED(desc_size))
> + return -EINVAL;
> +
> + ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn((void *)desc),
> + desc_size >> PAGE_SHIFT);
> + if (ret)
> + return ret;
> +
> + if (!hyp_trace_desc_validate(desc, desc_size))
> + goto err_donate_desc;
> +
> + hyp_spin_lock(&trace_buffer.lock);
> +
> + ret = hyp_trace_buffer_load(&trace_buffer, desc);
> +
> + hyp_spin_unlock(&trace_buffer.lock);
> +
> +err_donate_desc:
> + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn((void *)desc),
> + desc_size >> PAGE_SHIFT));
That's basically a guaranteed panic if anything goes wrong. Are you
sure you want to do that?
> + return ret;
> +}
> +
> +void __pkvm_unload_tracing(void)
> +{
> + hyp_spin_lock(&trace_buffer.lock);
> + hyp_trace_buffer_unload(&trace_buffer);
> + hyp_spin_unlock(&trace_buffer.lock);
> +}
> +
> +int __pkvm_enable_tracing(bool enable)
> +{
> + int cpu, ret = enable ? -EINVAL : 0;
> +
> + hyp_spin_lock(&trace_buffer.lock);
> +
> + if (!hyp_trace_buffer_loaded(&trace_buffer))
> + goto unlock;
> +
> + for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
> + simple_ring_buffer_enable_tracing(per_cpu_ptr(trace_buffer.simple_rbs, cpu),
> + enable);
> +
> + ret = 0;
> +
> +unlock:
> + hyp_spin_unlock(&trace_buffer.lock);
> +
> + return ret;
> +}
> +
> +int __pkvm_swap_reader_tracing(unsigned int cpu)
> +{
> + int ret;
> +
> + if (cpu >= hyp_nr_cpus)
> + return -EINVAL;
> +
> + hyp_spin_lock(&trace_buffer.lock);
> +
> + if (hyp_trace_buffer_loaded(&trace_buffer))
> + ret = simple_ring_buffer_swap_reader_page(
> + per_cpu_ptr(trace_buffer.simple_rbs, cpu));
Please keep these things on a single line. I don't care what people
(or checkpatch) say.
> + else
> + ret = -ENODEV;
> +
> + hyp_spin_unlock(&trace_buffer.lock);
> +
> + return ret;
> +}
> --
> 2.51.2.1041.gc1ab5b90ca-goog
>
>
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
Powered by blists - more mailing lists