[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87ldjyf3ei.wl-maz@kernel.org>
Date: Sat, 22 Nov 2025 16:17:09 +0000
From: Marc Zyngier <maz@...nel.org>
To: Tian Zheng <zhengtian10@...wei.com>
Cc: <oliver.upton@...ux.dev>,
<catalin.marinas@....com>,
<corbet@....net>,
<pbonzini@...hat.com>,
<will@...nel.org>,
<linux-kernel@...r.kernel.org>,
<yuzenghui@...wei.com>,
<wangzhou1@...ilicon.com>,
<yezhenyu2@...wei.com>,
<xiexiangyou@...wei.com>,
<zhengchuan@...wei.com>,
<linuxarm@...wei.com>,
<joey.gouly@....com>,
<kvmarm@...ts.linux.dev>,
<kvm@...r.kernel.org>,
<linux-arm-kernel@...ts.infradead.org>,
<linux-doc@...r.kernel.org>,
<suzuki.poulose@....com>
Subject: Re: [PATCH v2 4/5] KVM: arm64: Enable HDBSS support and handle HDBSSF events
On Fri, 21 Nov 2025 09:23:41 +0000,
Tian Zheng <zhengtian10@...wei.com> wrote:
>
> From: eillon <yezhenyu2@...wei.com>
>
> Implement the HDBSS enable/disable functionality using the
> KVM_CAP_ARM_HW_DIRTY_STATE_TRACK ioctl.
>
> Userspace (e.g., QEMU) can enable HDBSS by invoking the ioctl
> at the start of live migration, configuring the buffer size.
> The feature is disabled by invoking the ioctl again with size
> set to 0 once migration completes.
>
> Add support for updating the dirty bitmap based on the HDBSS
> buffer. Similar to the x86 PML implementation, KVM flushes the
> buffer on all VM-Exits, so running vCPUs only need to be kicked
> to force a VM-Exit.
Drop the x86 reference, nobody cares about other architectures.
Instead, please describe what state is handled where. Having to
reverse engineer this is particularly painful.
>
> Signed-off-by: eillon <yezhenyu2@...wei.com>
> Signed-off-by: Tian Zheng <zhengtian10@...wei.com>
> ---
> arch/arm64/include/asm/kvm_host.h | 10 +++
> arch/arm64/include/asm/kvm_mmu.h | 17 +++++
> arch/arm64/kvm/arm.c | 107 ++++++++++++++++++++++++++++++
> arch/arm64/kvm/handle_exit.c | 45 +++++++++++++
> arch/arm64/kvm/hyp/vhe/switch.c | 1 +
> arch/arm64/kvm/mmu.c | 10 +++
> arch/arm64/kvm/reset.c | 3 +
> include/linux/kvm_host.h | 1 +
> 8 files changed, 194 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index d962932f0e5f..408e4c2b3d1a 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -87,6 +87,7 @@ int __init kvm_arm_init_sve(void);
> u32 __attribute_const__ kvm_target_cpu(void);
> void kvm_reset_vcpu(struct kvm_vcpu *vcpu);
> void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
> +void kvm_arm_vcpu_free_hdbss(struct kvm_vcpu *vcpu);
>
> struct kvm_hyp_memcache {
> phys_addr_t head;
> @@ -793,6 +794,12 @@ struct vcpu_reset_state {
> bool reset;
> };
>
> +struct vcpu_hdbss_state {
> + phys_addr_t base_phys;
> + u32 size;
> + u32 next_index;
> +};
> +
> struct vncr_tlb;
>
> struct kvm_vcpu_arch {
> @@ -897,6 +904,9 @@ struct kvm_vcpu_arch {
>
> /* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */
> struct vncr_tlb *vncr_tlb;
> +
> + /* HDBSS registers info */
> + struct vcpu_hdbss_state hdbss;
> };
>
> /*
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index e4069f2ce642..6ace1080aed5 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -331,6 +331,23 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
> asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
> }
>
> +static __always_inline void __load_hdbss(struct kvm_vcpu *vcpu)
Why is this in an include file? If, as it appears, it is VHE only
(this patch completely ignores the nVHE code path), it should be
strictly local to the VHE code.
> +{
> + struct kvm *kvm = vcpu->kvm;
> + u64 br_el2, prod_el2;
> +
> + if (!kvm->enable_hdbss)
> + return;
Why is enable_hdbss in the *arch-independent* part of the kvm
structure?
> +
> + br_el2 = HDBSSBR_EL2(vcpu->arch.hdbss.base_phys, vcpu->arch.hdbss.size);
> + prod_el2 = vcpu->arch.hdbss.next_index;
> +
> + write_sysreg_s(br_el2, SYS_HDBSSBR_EL2);
> + write_sysreg_s(prod_el2, SYS_HDBSSPROD_EL2);
> +
> + isb();
> +}
> +
> static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
> {
> return container_of(mmu->arch, struct kvm, arch);
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 870953b4a8a7..64f65e3c2a89 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -79,6 +79,92 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
> }
>
> +void kvm_arm_vcpu_free_hdbss(struct kvm_vcpu *vcpu)
> +{
> + struct page *hdbss_pg = NULL;
Why the NULL init?
> +
> + hdbss_pg = phys_to_page(vcpu->arch.hdbss.base_phys);
> + if (hdbss_pg)
> + __free_pages(hdbss_pg, vcpu->arch.hdbss.size);
> +
> + vcpu->arch.hdbss = (struct vcpu_hdbss_state) {
> + .base_phys = 0,
> + .size = 0,
> + .next_index = 0,
> + };
It should be enough to set size to 0.
> +}
> +
> +static int kvm_cap_arm_enable_hdbss(struct kvm *kvm,
> + struct kvm_enable_cap *cap)
> +{
> + unsigned long i;
> + struct kvm_vcpu *vcpu;
> + struct page *hdbss_pg = NULL;
> + int size = cap->args[0];
This is the wrong type. Use unsigned types, preferably the same as the
userspace structure.
> + int ret = 0;
> +
> + if (!system_supports_hdbss()) {
> + kvm_err("This system does not support HDBSS!\n");
No. We don't print *anything* on error.
> + return -EINVAL;
> + }
> +
> + if (size < 0 || size > HDBSS_MAX_SIZE) {
> + kvm_err("Invalid HDBSS buffer size: %d!\n", size);
> + return -EINVAL;
Same thing. Please remove *all* the kvm_err() crap.
And with an unsigned type, you avoid the < 0 compare.
> + }
> +
> + /* Enable the HDBSS feature if size > 0, otherwise disable it. */
> + if (size) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + hdbss_pg = alloc_pages(GFP_KERNEL_ACCOUNT, size);
> + if (!hdbss_pg) {
> + kvm_err("Alloc HDBSS buffer failed!\n");
> + ret = -ENOMEM;
> + goto error_alloc;
> + }
> +
> + vcpu->arch.hdbss = (struct vcpu_hdbss_state) {
> + .base_phys = page_to_phys(hdbss_pg),
> + .size = size,
> + .next_index = 0,
> + };
> + }
> +
> + kvm->enable_hdbss = true;
> + kvm->arch.mmu.vtcr |= VTCR_EL2_HD | VTCR_EL2_HDBSS;
> +
> + /*
> + * We should kick vcpus out of guest mode here to load new
> + * vtcr value to vtcr_el2 register when re-enter guest mode.
> + */
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + kvm_vcpu_kick(vcpu);
I don't think this is correct. You should start by stopping all vcpus,
install the logging on all of them, and only then restart them.
Otherwise, your error handling is totally broken. You can end-up
freeing memory that is in active use!
> + } else if (kvm->enable_hdbss) {
> + kvm->arch.mmu.vtcr &= ~(VTCR_EL2_HD | VTCR_EL2_HDBSS);
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + /* Kick vcpus to flush hdbss buffer. */
> + kvm_vcpu_kick(vcpu);
> +
> + kvm_arm_vcpu_free_hdbss(vcpu);
> + }
Same thing.
> +
> + kvm->enable_hdbss = false;
> + }
> +
> + return ret;
> +
> +error_alloc:
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!vcpu->arch.hdbss.base_phys && !vcpu->arch.hdbss.size)
> + continue;
> +
> + kvm_arm_vcpu_free_hdbss(vcpu);
> + }
> +
> + return ret;
> +}
> +
> int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> struct kvm_enable_cap *cap)
> {
> @@ -132,6 +218,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> }
> mutex_unlock(&kvm->lock);
> break;
> + case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK:
> + mutex_lock(&kvm->lock);
> + r = kvm_cap_arm_enable_hdbss(kvm, cap);
> + mutex_unlock(&kvm->lock);
> + break;
> default:
> break;
> }
> @@ -420,6 +511,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> r = kvm_supports_cacheable_pfnmap();
> break;
>
> + case KVM_CAP_ARM_HW_DIRTY_STATE_TRACK:
> + r = system_supports_hdbss();
> + break;
> default:
> r = 0;
> }
> @@ -1837,7 +1931,20 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>
> void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
> {
> + /*
> + * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
> + * before reporting dirty_bitmap to userspace. KVM flushes the buffers
> + * on all VM-Exits, thus we only need to kick running vCPUs to force a
> + * VM-Exit.
> + */
> + struct kvm_vcpu *vcpu;
> + unsigned long i;
>
> + if (!kvm->enable_hdbss)
> + return;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + kvm_vcpu_kick(vcpu);
And then what? You return to userspace while the stuff is being
written, without any synchronisation? How does userspace know that the
state is consistent?
Also, I'm not sure you can ignore the memslot. It *is* relevant.
> }
>
> static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
> diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
> index cc7d5d1709cb..9ba0ea6305ef 100644
> --- a/arch/arm64/kvm/handle_exit.c
> +++ b/arch/arm64/kvm/handle_exit.c
> @@ -412,6 +412,49 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
> return arm_exit_handlers[esr_ec];
> }
>
> +static void kvm_flush_hdbss_buffer(struct kvm_vcpu *vcpu)
> +{
> + int idx, curr_idx;
> + u64 *hdbss_buf;
> + struct kvm *kvm = vcpu->kvm;
> + u64 br_el2;
> +
> + if (!kvm->enable_hdbss)
> + return;
> +
> + dsb(sy);
> + isb();
What are these barriers for?
> + curr_idx = HDBSSPROD_IDX(read_sysreg_s(SYS_HDBSSPROD_EL2));
> + br_el2 = HDBSSBR_EL2(vcpu->arch.hdbss.base_phys, vcpu->arch.hdbss.size);
> +
> + /* Do nothing if HDBSS buffer is empty or br_el2 is NULL */
> + if (curr_idx == 0 || br_el2 == 0)
> + return;
> +
> + hdbss_buf = page_address(phys_to_page(vcpu->arch.hdbss.base_phys));
> + if (!hdbss_buf) {
> + kvm_err("Enter flush hdbss buffer with buffer == NULL!");
> + return;
> + }
> +
> + guard(write_lock_irqsave)(&vcpu->kvm->mmu_lock);
> + for (idx = 0; idx < curr_idx; idx++) {
> + u64 gpa;
> +
> + gpa = hdbss_buf[idx];
> + if (!(gpa & HDBSS_ENTRY_VALID))
> + continue;
> +
> + gpa &= HDBSS_ENTRY_IPA;
> + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
> + }
> +
> + /* reset HDBSS index */
> + write_sysreg_s(0, SYS_HDBSSPROD_EL2);
> + vcpu->arch.hdbss.next_index = 0;
> + isb();
> +}
> +
> /*
> * We may be single-stepping an emulated instruction. If the emulation
> * has been completed in the kernel, we can return to userspace with a
> @@ -447,6 +490,8 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index)
> {
> struct kvm_run *run = vcpu->run;
>
> + kvm_flush_hdbss_buffer(vcpu);
> +
> if (ARM_SERROR_PENDING(exception_index)) {
> /*
> * The SError is handled by handle_exit_early(). If the guest
> diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
> index 9984c492305a..3787c9c5810d 100644
> --- a/arch/arm64/kvm/hyp/vhe/switch.c
> +++ b/arch/arm64/kvm/hyp/vhe/switch.c
> @@ -220,6 +220,7 @@ void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu)
> __vcpu_load_switch_sysregs(vcpu);
> __vcpu_load_activate_traps(vcpu);
> __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch);
> + __load_hdbss(vcpu);
> }
>
> void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu)
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 7cc964af8d30..91a2f9dbb406 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1843,6 +1843,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> if (writable)
> prot |= KVM_PGTABLE_PROT_W;
>
> + if (writable && kvm->enable_hdbss && logging_active)
> + prot |= KVM_PGTABLE_PROT_DBM;
> +
So all the pages are dirty by default, but the logger can't see it.
This is really broken, and I think you haven't really tested this
code.
> if (exec_fault)
> prot |= KVM_PGTABLE_PROT_X;
>
> @@ -1950,6 +1953,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>
> is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
>
> + /*
> + * HDBSS buffer already flushed when enter handle_trap_exceptions().
> + * Nothing to do here.
> + */
> + if (ESR_ELx_ISS2(esr) & ESR_ELx_HDBSSF)
> + return 1;
> +
Flushing the buffer doesn't solve a potential error state. And what if
you have overflowed the buffer? There is a lot of things that can
happen as a result of R_STJMN, and I don't see any of that being
described here.
> if (esr_fsc_is_translation_fault(esr)) {
> /* Beyond sanitised PARange (which is the IPA limit) */
> if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index 959532422d3a..65e8f890f863 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -161,6 +161,9 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
> free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
> kfree(vcpu->arch.vncr_tlb);
> kfree(vcpu->arch.ccsidr);
> +
> + if (vcpu->arch.hdbss.base_phys || vcpu->arch.hdbss.size)
> + kvm_arm_vcpu_free_hdbss(vcpu);
> }
>
> static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5bd76cf394fa..aa8138604b1e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -876,6 +876,7 @@ struct kvm {
> struct xarray mem_attr_array;
> #endif
> char stats_id[KVM_STATS_NAME_SIZE];
> + bool enable_hdbss;
Well, no.
This is broken in a number of ways, and I haven't considered the UAPI
yet. The error handling is non-existent, and this will actively
corrupt memory (again). Marking writable pages as writable actively
prevents logging, meaning that you will happily miss dirty pages.
I'll stop reviewing this series here.
M.
--
Jazz isn't dead. It just smells funny.
Powered by blists - more mailing lists