Message-ID: <86bjkyrly9.wl-maz@kernel.org>
Date: Wed, 19 Nov 2025 17:06:38 +0000
From: Marc Zyngier <maz@...nel.org>
To: Vincent Donnefort <vdonnefort@...gle.com>
Cc: rostedt@...dmis.org,
	mhiramat@...nel.org,
	mathieu.desnoyers@...icios.com,
	linux-trace-kernel@...r.kernel.org,
	oliver.upton@...ux.dev,
	joey.gouly@....com,
	suzuki.poulose@....com,
	yuzenghui@...wei.com,
	kvmarm@...ts.linux.dev,
	linux-arm-kernel@...ts.infradead.org,
	jstultz@...gle.com,
	qperret@...gle.com,
	will@...nel.org,
	aneesh.kumar@...nel.org,
	kernel-team@...roid.com,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH v8 21/28] KVM: arm64: Add tracing capability for the pKVM hyp

On Fri, 07 Nov 2025 09:38:33 +0000,
Vincent Donnefort <vdonnefort@...gle.com> wrote:
> 
> When running in protected mode, the host has very little knowledge of
> what is happening in the hypervisor. This is of course an essential
> security property, but as that piece of code grows and takes on more
> responsibilities, we now need a way to debug and profile it. Tracefs,
> with its reliability, versatility and user-space support, is the
> perfect tool for the job.
> 
> There's no way the hypervisor could log events directly into the host
> tracefs ring-buffers. So instead let's use our own, where the hypervisor
> is the writer and the host the reader.
> 
> Signed-off-by: Vincent Donnefort <vdonnefort@...gle.com>
> 
> diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
> index 9da54d4ee49e..ad02dee140d3 100644
> --- a/arch/arm64/include/asm/kvm_asm.h
> +++ b/arch/arm64/include/asm/kvm_asm.h
> @@ -89,6 +89,10 @@ enum __kvm_host_smccc_func {
>  	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load,
>  	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put,
>  	__KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
> +	__KVM_HOST_SMCCC_FUNC___pkvm_load_tracing,
> +	__KVM_HOST_SMCCC_FUNC___pkvm_unload_tracing,
> +	__KVM_HOST_SMCCC_FUNC___pkvm_enable_tracing,
> +	__KVM_HOST_SMCCC_FUNC___pkvm_swap_reader_tracing,
>  };
>  
>  #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
> diff --git a/arch/arm64/include/asm/kvm_hyptrace.h b/arch/arm64/include/asm/kvm_hyptrace.h
> new file mode 100644
> index 000000000000..9c30a479bc36
> --- /dev/null
> +++ b/arch/arm64/include/asm/kvm_hyptrace.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __ARM64_KVM_HYPTRACE_H_
> +#define __ARM64_KVM_HYPTRACE_H_
> +
> +#include <linux/ring_buffer.h>
> +
> +struct hyp_trace_desc {
> +	unsigned long			bpages_backing_start;

Why is this an integer type? You keep casting it all over the place,
which tells me that's not the ideal type.
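
Something like this, maybe (untested sketch, purely to illustrate a
pointer type -- adjust to whatever the field really represents):

	struct hyp_trace_desc {
		/* sketch: pointer instead of unsigned long */
		void				*bpages_backing_start;
		size_t				bpages_backing_size;
		struct trace_buffer_desc	trace_buffer_desc;
	};

and the kern_hyp_va()/hyp_virt_to_pfn() users would no longer need the
repeated casts.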

> +	size_t				bpages_backing_size;
> +	struct trace_buffer_desc	trace_buffer_desc;
> +
> +};
> +#endif
> diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
> index 4f803fd1c99a..580426cdbe77 100644
> --- a/arch/arm64/kvm/Kconfig
> +++ b/arch/arm64/kvm/Kconfig
> @@ -83,4 +83,11 @@ config PTDUMP_STAGE2_DEBUGFS
>  
>  	  If in doubt, say N.
>  
> +config PKVM_TRACING
> +	bool
> +	depends on KVM
> +	depends on TRACING
> +	select SIMPLE_RING_BUFFER
> +	default y

I'd rather this is made to depend on NVHE_EL2_DEBUG, just like the
other debug options.
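
Something along these lines (untested, simply adding the dependency on
top of what is already there):

config PKVM_TRACING
	bool
	depends on KVM
	depends on NVHE_EL2_DEBUG
	depends on TRACING
	select SIMPLE_RING_BUFFER
	default y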

> +
>  endif # VIRTUALIZATION
> diff --git a/arch/arm64/kvm/hyp/include/nvhe/trace.h b/arch/arm64/kvm/hyp/include/nvhe/trace.h
> new file mode 100644
> index 000000000000..996e90c0974f
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/include/nvhe/trace.h
> @@ -0,0 +1,23 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __ARM64_KVM_HYP_NVHE_TRACE_H
> +#define __ARM64_KVM_HYP_NVHE_TRACE_H
> +#include <asm/kvm_hyptrace.h>
> +
> +#ifdef CONFIG_PKVM_TRACING
> +void *tracing_reserve_entry(unsigned long length);
> +void tracing_commit_entry(void);
> +
> +int __pkvm_load_tracing(unsigned long desc_va, size_t desc_size);
> +void __pkvm_unload_tracing(void);
> +int __pkvm_enable_tracing(bool enable);
> +int __pkvm_swap_reader_tracing(unsigned int cpu);
> +#else
> +static inline void *tracing_reserve_entry(unsigned long length) { return NULL; }
> +static inline void tracing_commit_entry(void) { }
> +
> +static inline int __pkvm_load_tracing(unsigned long desc_va, size_t desc_size) { return -ENODEV; }
> +static inline void __pkvm_unload_tracing(void) { }
> +static inline int __pkvm_enable_tracing(bool enable) { return -ENODEV; }
> +static inline int __pkvm_swap_reader_tracing(unsigned int cpu) { return -ENODEV; }
> +#endif
> +#endif
> diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
> index f55a9a17d38f..504c3b9caef8 100644
> --- a/arch/arm64/kvm/hyp/nvhe/Makefile
> +++ b/arch/arm64/kvm/hyp/nvhe/Makefile
> @@ -29,7 +29,7 @@ hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
>  	 ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
>  hyp-obj-y += ../../../kernel/smccc-call.o
>  hyp-obj-$(CONFIG_LIST_HARDENED) += list_debug.o
> -hyp-obj-$(CONFIG_PKVM_TRACING) += clock.o
> +hyp-obj-$(CONFIG_PKVM_TRACING) += clock.o trace.o ../../../../../kernel/trace/simple_ring_buffer.o

Can we get something less awful here? Surely there is a way to get an
absolute path from the kbuild infrastructure? $(objtree) springs to
mind...

>  hyp-obj-y += $(lib-objs)
>  
>  ##
> diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> index 29430c031095..6381e50ff531 100644
> --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
> @@ -18,6 +18,7 @@
>  #include <nvhe/mem_protect.h>
>  #include <nvhe/mm.h>
>  #include <nvhe/pkvm.h>
> +#include <nvhe/trace.h>
>  #include <nvhe/trap_handler.h>
>  
>  DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
> @@ -585,6 +586,35 @@ static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
>  	cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
>  }
>  
> +static void handle___pkvm_load_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> +	 DECLARE_REG(unsigned long, desc_hva, host_ctxt, 1);
> +	 DECLARE_REG(size_t, desc_size, host_ctxt, 2);
> +
> +	 cpu_reg(host_ctxt, 1) = __pkvm_load_tracing(desc_hva, desc_size);
> +}
> +
> +static void handle___pkvm_unload_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> +	__pkvm_unload_tracing();
> +
> +	cpu_reg(host_ctxt, 1) = 0;
> +}
> +
> +static void handle___pkvm_enable_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> +	DECLARE_REG(bool, enable, host_ctxt, 1);
> +
> +	cpu_reg(host_ctxt, 1) = __pkvm_enable_tracing(enable);
> +}
> +
> +static void handle___pkvm_swap_reader_tracing(struct kvm_cpu_context *host_ctxt)
> +{
> +	DECLARE_REG(unsigned int, cpu, host_ctxt, 1);
> +
> +	cpu_reg(host_ctxt, 1) = __pkvm_swap_reader_tracing(cpu);
> +}
> +
>  typedef void (*hcall_t)(struct kvm_cpu_context *);
>  
>  #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
> @@ -626,6 +656,10 @@ static const hcall_t host_hcall[] = {
>  	HANDLE_FUNC(__pkvm_vcpu_load),
>  	HANDLE_FUNC(__pkvm_vcpu_put),
>  	HANDLE_FUNC(__pkvm_tlb_flush_vmid),
> +	HANDLE_FUNC(__pkvm_load_tracing),
> +	HANDLE_FUNC(__pkvm_unload_tracing),
> +	HANDLE_FUNC(__pkvm_enable_tracing),
> +	HANDLE_FUNC(__pkvm_swap_reader_tracing),
>  };
>  
>  static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
> diff --git a/arch/arm64/kvm/hyp/nvhe/trace.c b/arch/arm64/kvm/hyp/nvhe/trace.c
> new file mode 100644
> index 000000000000..def5cbc75722
> --- /dev/null
> +++ b/arch/arm64/kvm/hyp/nvhe/trace.c
> @@ -0,0 +1,257 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (C) 2025 Google LLC
> + * Author: Vincent Donnefort <vdonnefort@...gle.com>
> + */
> +
> +#include <nvhe/clock.h>
> +#include <nvhe/mem_protect.h>
> +#include <nvhe/mm.h>
> +#include <nvhe/trace.h>
> +
> +#include <asm/percpu.h>
> +#include <asm/kvm_mmu.h>
> +#include <asm/local.h>
> +
> +#include <linux/simple_ring_buffer.h>
> +
> +static DEFINE_PER_CPU(struct simple_rb_per_cpu, __simple_rbs);
> +
> +static struct hyp_trace_buffer {
> +	struct simple_rb_per_cpu __percpu	*simple_rbs;
> +	unsigned long				bpages_backing_start;
> +	size_t					bpages_backing_size;
> +	hyp_spinlock_t				lock;
> +} trace_buffer = {
> +	.simple_rbs = &__simple_rbs,
> +	.lock = __HYP_SPIN_LOCK_UNLOCKED,
> +};
> +
> +static bool hyp_trace_buffer_loaded(struct hyp_trace_buffer *trace_buffer)
> +{
> +	return trace_buffer->bpages_backing_size > 0;
> +}
> +
> +void *tracing_reserve_entry(unsigned long length)
> +{
> +	return simple_ring_buffer_reserve(this_cpu_ptr(trace_buffer.simple_rbs), length,
> +					  trace_clock());
> +}
> +
> +void tracing_commit_entry(void)
> +{
> +	simple_ring_buffer_commit(this_cpu_ptr(trace_buffer.simple_rbs));
> +}
> +
> +static int hyp_trace_buffer_load_bpage_backing(struct hyp_trace_buffer *trace_buffer,
> +					       struct hyp_trace_desc *desc)
> +{
> +	unsigned long start = kern_hyp_va(desc->bpages_backing_start);
> +	size_t size = desc->bpages_backing_size;
> +	int ret;
> +
> +	if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size))
> +		return -EINVAL;
> +
> +	ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn((void *)start), size >> PAGE_SHIFT);
> +	if (ret)
> +		return ret;
> +
> +	memset((void *)start, 0, size);
> +
> +	trace_buffer->bpages_backing_start = start;
> +	trace_buffer->bpages_backing_size = size;
> +
> +	return 0;
> +}
> +
> +static void hyp_trace_buffer_unload_bpage_backing(struct hyp_trace_buffer *trace_buffer)
> +{
> +	unsigned long start = trace_buffer->bpages_backing_start;
> +	size_t size = trace_buffer->bpages_backing_size;
> +
> +	if (!size)
> +		return;
> +
> +	memset((void *)start, 0, size);
> +
> +	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(start), size >> PAGE_SHIFT));
> +
> +	trace_buffer->bpages_backing_start = 0;
> +	trace_buffer->bpages_backing_size = 0;
> +}
> +
> +static void *__pin_shared_page(unsigned long kern_va)
> +{
> +	void *va = kern_hyp_va((void *)kern_va);
> +
> +	return hyp_pin_shared_mem(va, va + PAGE_SIZE) ? NULL : va;
> +}
> +
> +static void __unpin_shared_page(void *va)
> +{
> +	hyp_unpin_shared_mem(va, va + PAGE_SIZE);
> +}
> +
> +static void hyp_trace_buffer_unload(struct hyp_trace_buffer *trace_buffer)
> +{
> +	int cpu;
> +
> +	hyp_assert_lock_held(&trace_buffer->lock);
> +
> +	if (!hyp_trace_buffer_loaded(trace_buffer))
> +		return;
> +
> +	for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
> +		__simple_ring_buffer_unload(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
> +					    __unpin_shared_page);
> +
> +	hyp_trace_buffer_unload_bpage_backing(trace_buffer);
> +}
> +
> +static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer,
> +				 struct hyp_trace_desc *desc)
> +{
> +	struct simple_buffer_page *bpages;
> +	struct ring_buffer_desc *rb_desc;
> +	int ret, cpu;
> +
> +	hyp_assert_lock_held(&trace_buffer->lock);
> +
> +	if (hyp_trace_buffer_loaded(trace_buffer))
> +		return -EINVAL;
> +
> +	ret = hyp_trace_buffer_load_bpage_backing(trace_buffer, desc);
> +	if (ret)
> +		return ret;
> +
> +	bpages = (struct simple_buffer_page *)trace_buffer->bpages_backing_start;
> +	for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
> +		ret = __simple_ring_buffer_init(per_cpu_ptr(trace_buffer->simple_rbs, cpu),
> +						bpages, rb_desc, __pin_shared_page,
> +						__unpin_shared_page);
> +		if (ret)
> +			break;
> +
> +		bpages += rb_desc->nr_page_va;
> +	}
> +
> +	if (ret)
> +		hyp_trace_buffer_unload(trace_buffer);
> +
> +	return ret;
> +}
> +
> +static bool hyp_trace_desc_validate(struct hyp_trace_desc *desc, size_t desc_size)
> +{
> +	struct simple_buffer_page *bpages = (struct simple_buffer_page *)desc->bpages_backing_start;
> +	struct ring_buffer_desc *rb_desc;
> +	void *bpages_end, *desc_end;
> +	unsigned int cpu;
> +
> +	desc_end = (void *)desc + desc_size; /* __pkvm_host_donate_hyp validates desc_size */
> +
> +	bpages_end = (void *)desc->bpages_backing_start + desc->bpages_backing_size;
> +	if (bpages_end < (void *)desc->bpages_backing_start)
> +		return false;
> +
> +	for_each_ring_buffer_desc(rb_desc, cpu, &desc->trace_buffer_desc) {
> +		/* Can we read nr_page_va? */
> +		if ((void *)rb_desc + struct_size(rb_desc, page_va, 0) > desc_end)
> +			return false;
> +
> +		/* Overflow desc? */
> +		if ((void *)rb_desc + struct_size(rb_desc, page_va, rb_desc->nr_page_va) > desc_end)
> +			return false;
> +
> +		/* Overflow bpages backing memory? */
> +		if ((void *)(bpages + rb_desc->nr_page_va) > bpages_end)
> +			return false;
> +
> +		if (cpu >= hyp_nr_cpus)
> +			return false;
> +
> +		if (cpu != rb_desc->cpu)
> +			return false;
> +
> +		bpages += rb_desc->nr_page_va;
> +	}
> +
> +	return true;
> +}
> +
> +int __pkvm_load_tracing(unsigned long desc_hva, size_t desc_size)
> +{
> +	struct hyp_trace_desc *desc = (struct hyp_trace_desc *)kern_hyp_va(desc_hva);
> +	int ret;
> +
> +	if (!desc_size || !PAGE_ALIGNED(desc_hva) || !PAGE_ALIGNED(desc_size))
> +		return -EINVAL;
> +
> +	ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn((void *)desc),
> +				     desc_size >> PAGE_SHIFT);
> +	if (ret)
> +		return ret;
> +
> +	if (!hyp_trace_desc_validate(desc, desc_size))
> +		goto err_donate_desc;
> +
> +	hyp_spin_lock(&trace_buffer.lock);
> +
> +	ret = hyp_trace_buffer_load(&trace_buffer, desc);
> +
> +	hyp_spin_unlock(&trace_buffer.lock);
> +
> +err_donate_desc:
> +	WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn((void *)desc),
> +				       desc_size >> PAGE_SHIFT));

That's basically a guaranteed panic if anything goes wrong. Are you
sure you want to do that?
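
If the intention is not to bring EL2 down on that error path, one
alternative (hypothetical sketch, error value picked arbitrarily) would
be to fold the failure into the return code instead:

err_donate_desc:
	/* hypothetical: report the failure rather than WARN (error value arbitrary) */
	if (__pkvm_hyp_donate_host(hyp_virt_to_pfn((void *)desc),
				   desc_size >> PAGE_SHIFT))
		ret = ret ?: -EPERM;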

> +	return ret;
> +}
> +
> +void __pkvm_unload_tracing(void)
> +{
> +	hyp_spin_lock(&trace_buffer.lock);
> +	hyp_trace_buffer_unload(&trace_buffer);
> +	hyp_spin_unlock(&trace_buffer.lock);
> +}
> +
> +int __pkvm_enable_tracing(bool enable)
> +{
> +	int cpu, ret = enable ? -EINVAL : 0;
> +
> +	hyp_spin_lock(&trace_buffer.lock);
> +
> +	if (!hyp_trace_buffer_loaded(&trace_buffer))
> +		goto unlock;
> +
> +	for (cpu = 0; cpu < hyp_nr_cpus; cpu++)
> +		simple_ring_buffer_enable_tracing(per_cpu_ptr(trace_buffer.simple_rbs, cpu),
> +						  enable);
> +
> +	ret = 0;
> +
> +unlock:
> +	hyp_spin_unlock(&trace_buffer.lock);
> +
> +	return ret;
> +}
> +
> +int __pkvm_swap_reader_tracing(unsigned int cpu)
> +{
> +	int ret;
> +
> +	if (cpu >= hyp_nr_cpus)
> +		return -EINVAL;
> +
> +	hyp_spin_lock(&trace_buffer.lock);
> +
> +	if (hyp_trace_buffer_loaded(&trace_buffer))
> +		ret = simple_ring_buffer_swap_reader_page(
> +				per_cpu_ptr(trace_buffer.simple_rbs, cpu));

Please keep these things on a single line. I don't care what people
(of checkpatch) say.
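
i.e. (same code, just joined up):

		ret = simple_ring_buffer_swap_reader_page(per_cpu_ptr(trace_buffer.simple_rbs, cpu));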

> +	else
> +		ret = -ENODEV;
> +
> +	hyp_spin_unlock(&trace_buffer.lock);
> +
> +	return ret;
> +}
> -- 
> 2.51.2.1041.gc1ab5b90ca-goog
> 
> 

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.
