[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <861q69oi9c.wl-maz@kernel.org>
Date: Fri, 10 May 2024 15:26:23 +0100
From: Marc Zyngier <maz@...nel.org>
To: Colton Lewis <coltonlewis@...gle.com>
Cc: kvm@...r.kernel.org, Jonathan Corbet <corbet@....net>, Oliver
Upton <oliver.upton@...ux.dev>, James Morse <james.morse@....com>,
Suzuki K Poulose <suzuki.poulose@....com>, Zenghui Yu <yuzenghui@...wei.com>,
Catalin Marinas <catalin.marinas@....com>, Will Deacon <will@...nel.org>,
linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org, kvmarm@...ts.linux.dev
Subject: Re: [PATCH v5] KVM: arm64: Add early_param to control WFx trapping
On Tue, 30 Apr 2024 19:14:44 +0100,
Colton Lewis <coltonlewis@...gle.com> wrote:
>
> Add early_params to control WFI and WFE trapping. This is to
> control the degree guests can wait for interrupts on their own without
> being trapped by KVM. Options for each param are trap and notrap. trap
> enables the trap. notrap disables the trap. Absent an explicitly set
> policy, default to current behavior: disabling the trap if only a
> single task is running and enabling otherwise.
>
> Signed-off-by: Colton Lewis <coltonlewis@...gle.com>
> ---
> v5:
>
> * Move trap configuration to vcpu_reset_hcr(). This required moving
> kvm_emulate.h:vcpu_reset_hcr() to arm.c:kvm_vcpu_reset_hcr() to avoid needing
> to pull scheduler headers and my enums into kvm_emulate.h. I thought the
> function looked too bulky for that header anyway.
> * Delete vcpu_{set,clear}_wfx_traps helpers that are no longer used anywhere.
> * Remove documentation of explicit option for default behavior to avoid any
> implicit suggestion default behavior will stay that way.
>
> v4:
> https://lore.kernel.org/kvmarm/20240422181716.237284-1-coltonlewis@google.com/
>
> v3:
> https://lore.kernel.org/kvmarm/20240410175437.793508-1-coltonlewis@google.com/
>
> v2:
> https://lore.kernel.org/kvmarm/20240319164341.1674863-1-coltonlewis@google.com/
>
> v1:
> https://lore.kernel.org/kvmarm/20240129213918.3124494-1-coltonlewis@google.com/
>
> .../admin-guide/kernel-parameters.txt | 16 +++
> arch/arm64/include/asm/kvm_emulate.h | 53 ---------
> arch/arm64/include/asm/kvm_host.h | 7 ++
> arch/arm64/kvm/arm.c | 110 +++++++++++++++++-
> 4 files changed, 127 insertions(+), 59 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 31b3a25680d0..a4d94d9abbe4 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2653,6 +2653,22 @@
> [KVM,ARM] Allow use of GICv4 for direct injection of
> LPIs.
>
> + kvm-arm.wfe_trap_policy=
> + [KVM,ARM] Control when to set WFE instruction trap for
> + KVM VMs.
> +
> + trap: set WFE instruction trap
> +
> + notrap: clear WFE instruction trap
> +
> + kvm-arm.wfi_trap_policy=
> + [KVM,ARM] Control when to set WFI instruction trap for
> + KVM VMs.
> +
> + trap: set WFI instruction trap
> +
> + notrap: clear WFI instruction trap
> +
Please make it clear that neither trap is guaranteed. The
architecture *allows* an implementation to trap when no events (resp.
interrupts) are pending, but nothing more. An implementation is
perfectly allowed to ignore these bits.
> kvm_cma_resv_ratio=n [PPC]
> Reserves given percentage from system memory area for
> contiguous memory allocation for KVM hash pagetable
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index b804fe832184..c2a9a409ebfe 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -67,64 +67,11 @@ static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
> }
> #endif
>
> -static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
> -{
> - vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
> - if (has_vhe() || has_hvhe())
> - vcpu->arch.hcr_el2 |= HCR_E2H;
> - if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
> - /* route synchronous external abort exceptions to EL2 */
> - vcpu->arch.hcr_el2 |= HCR_TEA;
> - /* trap error record accesses */
> - vcpu->arch.hcr_el2 |= HCR_TERR;
> - }
> -
> - if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) {
> - vcpu->arch.hcr_el2 |= HCR_FWB;
> - } else {
> - /*
> - * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
> - * get set in SCTLR_EL1 such that we can detect when the guest
> - * MMU gets turned on and do the necessary cache maintenance
> - * then.
> - */
> - vcpu->arch.hcr_el2 |= HCR_TVM;
> - }
> -
> - if (cpus_have_final_cap(ARM64_HAS_EVT) &&
> - !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE))
> - vcpu->arch.hcr_el2 |= HCR_TID4;
> - else
> - vcpu->arch.hcr_el2 |= HCR_TID2;
> -
> - if (vcpu_el1_is_32bit(vcpu))
> - vcpu->arch.hcr_el2 &= ~HCR_RW;
> -
> - if (kvm_has_mte(vcpu->kvm))
> - vcpu->arch.hcr_el2 |= HCR_ATA;
> -}
> -
> static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
> {
> return (unsigned long *)&vcpu->arch.hcr_el2;
> }
>
> -static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu)
> -{
> - vcpu->arch.hcr_el2 &= ~HCR_TWE;
> - if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
> - vcpu->kvm->arch.vgic.nassgireq)
> - vcpu->arch.hcr_el2 &= ~HCR_TWI;
> - else
> - vcpu->arch.hcr_el2 |= HCR_TWI;
> -}
> -
> -static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu)
> -{
> - vcpu->arch.hcr_el2 |= HCR_TWE;
> - vcpu->arch.hcr_el2 |= HCR_TWI;
> -}
> -
> static inline void vcpu_ptrauth_enable(struct kvm_vcpu *vcpu)
> {
> vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 21c57b812569..315ee7bfc1cb 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -67,6 +67,13 @@ enum kvm_mode {
> KVM_MODE_NV,
> KVM_MODE_NONE,
> };
> +
> +enum kvm_wfx_trap_policy {
> + KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
> + KVM_WFX_NOTRAP,
> + KVM_WFX_TRAP,
> +};
Since this is only ever used in arm.c, it really doesn't need to be
exposed anywhere else.
> +
> #ifdef CONFIG_KVM
> enum kvm_mode kvm_get_mode(void);
> #else
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index a25265aca432..5ec52333e042 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -46,6 +46,8 @@
> #include <kvm/arm_psci.h>
>
> static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
> +static enum kvm_wfx_trap_policy kvm_wfi_trap_policy = KVM_WFX_NOTRAP_SINGLE_TASK;
> +static enum kvm_wfx_trap_policy kvm_wfe_trap_policy = KVM_WFX_NOTRAP_SINGLE_TASK;
It would be worth declaring those as __read_mostly.
>
> DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
>
> @@ -456,11 +458,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
> kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
>
> - if (single_task_running())
> - vcpu_clear_wfx_traps(vcpu);
> - else
> - vcpu_set_wfx_traps(vcpu);
> -
> if (vcpu_has_ptrauth(vcpu))
> vcpu_ptrauth_disable(vcpu);
> kvm_arch_vcpu_load_debug_state_flags(vcpu);
> @@ -1391,6 +1388,72 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
> return 0;
> }
>
> +static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
> +{
> + if (likely(kvm_wfi_trap_policy == KVM_WFX_NOTRAP_SINGLE_TASK))
> + return single_task_running() &&
> + (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
> + vcpu->kvm->arch.vgic.nassgireq);
So you are evaluating a runtime condition (scheduler queue length,
number of LPIs)...
> +
> + return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
> +}
> +
> +static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
> +{
> + if (likely(kvm_wfe_trap_policy == KVM_WFX_NOTRAP_SINGLE_TASK))
> + return single_task_running();
> +
> + return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
> +}
> +
> +static inline void kvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu)
Why the inline?
> +{
> + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
> + if (has_vhe() || has_hvhe())
> + vcpu->arch.hcr_el2 |= HCR_E2H;
> + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
> + /* route synchronous external abort exceptions to EL2 */
> + vcpu->arch.hcr_el2 |= HCR_TEA;
> + /* trap error record accesses */
> + vcpu->arch.hcr_el2 |= HCR_TERR;
> + }
> +
> + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) {
> + vcpu->arch.hcr_el2 |= HCR_FWB;
> + } else {
> + /*
> + * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
> + * get set in SCTLR_EL1 such that we can detect when the guest
> + * MMU gets turned on and do the necessary cache maintenance
> + * then.
> + */
> + vcpu->arch.hcr_el2 |= HCR_TVM;
> + }
> +
> + if (cpus_have_final_cap(ARM64_HAS_EVT) &&
> + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE))
> + vcpu->arch.hcr_el2 |= HCR_TID4;
> + else
> + vcpu->arch.hcr_el2 |= HCR_TID2;
> +
> + if (vcpu_el1_is_32bit(vcpu))
> + vcpu->arch.hcr_el2 &= ~HCR_RW;
> +
> + if (kvm_has_mte(vcpu->kvm))
> + vcpu->arch.hcr_el2 |= HCR_ATA;
> +
> +
> + if (kvm_vcpu_should_clear_twe(vcpu))
> + vcpu->arch.hcr_el2 &= ~HCR_TWE;
> + else
> + vcpu->arch.hcr_el2 |= HCR_TWE;
> +
> + if (kvm_vcpu_should_clear_twi(vcpu))
> + vcpu->arch.hcr_el2 &= ~HCR_TWI;
> + else
> + vcpu->arch.hcr_el2 |= HCR_TWI;
.. and from the above runtime conditions you make it a forever
decision, for a vcpu that still hasn't executed a single instruction.
What could possibly go wrong?
> +}
> +
> static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
> struct kvm_vcpu_init *init)
> {
> @@ -1427,7 +1490,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
> icache_inval_all_pou();
> }
>
> - vcpu_reset_hcr(vcpu);
> + kvm_vcpu_reset_hcr(vcpu);
> vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
>
> /*
> @@ -2654,6 +2717,41 @@ static int __init early_kvm_mode_cfg(char *arg)
> }
> early_param("kvm-arm.mode", early_kvm_mode_cfg);
>
> +static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
> +{
> + if (!arg)
> + return -EINVAL;
> +
> + if (strcmp(arg, "trap") == 0) {
> + *p = KVM_WFX_TRAP;
> + return 0;
> + }
> +
> + if (strcmp(arg, "notrap") == 0) {
> + *p = KVM_WFX_NOTRAP;
> + return 0;
> + }
> +
> + if (strcmp(arg, "default") == 0) {
> + *p = KVM_WFX_NOTRAP_SINGLE_TASK;
> + return 0;
> + }
Where is this "default" coming from? It's not documented.
M.
--
Without deviation from the norm, progress is not possible.
Powered by blists - more mailing lists