linux-kernel - Re: [PATCH 4/4] arm64: mte: Use stage-2 NoTagAccess memory attribute if supported

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87o734ts4m.wl-maz@kernel.org>
Date: Mon, 28 Oct 2024 10:54:17 +0000
From: Marc Zyngier <maz@...nel.org>
To: "Aneesh Kumar K.V (Arm)" <aneesh.kumar@...nel.org>
Cc: linux-kernel@...r.kernel.org,
	linux-arm-kernel@...ts.infradead.org,
	kvmarm@...ts.linux.dev,
	Suzuki K Poulose <Suzuki.Poulose@....com>,
	Steven Price <steven.price@....com>,
	Will Deacon <will@...nel.org>,
	Catalin Marinas <catalin.marinas@....com>,
	Mark Rutland <mark.rutland@....com>,
	Oliver Upton <oliver.upton@...ux.dev>,
	Joey Gouly <joey.gouly@....com>,
	Zenghui Yu <yuzenghui@...wei.com>
Subject: Re: [PATCH 4/4] arm64: mte: Use stage-2 NoTagAccess memory attribute if supported

On Mon, 28 Oct 2024 09:40:14 +0000,
"Aneesh Kumar K.V (Arm)" <aneesh.kumar@...nel.org> wrote:
> 
> Currently, the kernel won't start a guest if the MTE feature is enabled
> and the guest RAM is backed by memory which doesn't support access tags.
> Update this such that the kernel uses the NoTagAccess memory attribute
> while mapping pages from VMAs for which MTE is not allowed. The fault
> from accessing the access tags with such pages is forwarded to VMM so
> that VMM can decide to kill the guest or remap the pages so that
> access tag storage is allowed.

I only have questions here:

- what is the benefit of such approach? why shouldn't that be the
  kernel's job to fix it?

- where is the documentation for this new userspace ABI?

- are you expecting the VMM to create a new memslot for this?

- where is the example of a VMM using this?

> 
> NOTE: We could also use KVM_EXIT_MEMORY_FAULT for this. I chose to
> add a new EXIT type because this is arm64 specific exit type.
> 
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@...nel.org>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |  5 +++++
>  arch/arm64/include/asm/kvm_pgtable.h |  1 +
>  arch/arm64/kvm/hyp/pgtable.c         | 16 +++++++++++++---
>  arch/arm64/kvm/mmu.c                 | 28 ++++++++++++++++++++++------
>  include/uapi/linux/kvm.h             |  7 +++++++
>  5 files changed, 48 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index a601a9305b10..fa0149a0606a 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -373,6 +373,11 @@ static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
>  	return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
>  }
>  
> +static inline bool kvm_vcpu_trap_is_tagaccess(const struct kvm_vcpu *vcpu)
> +{
> +	return !!(ESR_ELx_ISS2(kvm_vcpu_get_esr(vcpu)) & ESR_ELx_TagAccess);
> +}
> +
>  static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
>  {
>  	return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC;
> diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
> index 03f4c3d7839c..5657ac1998ad 100644
> --- a/arch/arm64/include/asm/kvm_pgtable.h
> +++ b/arch/arm64/include/asm/kvm_pgtable.h
> @@ -252,6 +252,7 @@ enum kvm_pgtable_prot {
>  
>  	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
>  	KVM_PGTABLE_PROT_NORMAL_NC		= BIT(4),
> +	KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS	= BIT(5),

This seems wrong. NOTAGACCESS is a *permission*, not a memory type.

>  
>  	KVM_PGTABLE_PROT_SW0			= BIT(55),
>  	KVM_PGTABLE_PROT_SW1			= BIT(56),
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index b11bcebac908..bc0d9f08c49a 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -677,9 +677,11 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
>  {
>  	kvm_pte_t attr;
>  	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
> +	unsigned long prot_mask = KVM_PGTABLE_PROT_DEVICE |
> +				  KVM_PGTABLE_PROT_NORMAL_NC |
> +				  KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS;
>  
> -	switch (prot & (KVM_PGTABLE_PROT_DEVICE |
> -			KVM_PGTABLE_PROT_NORMAL_NC)) {
> +	switch (prot & prot_mask) {
>  	case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
>  		return -EINVAL;
>  	case KVM_PGTABLE_PROT_DEVICE:
> @@ -692,6 +694,12 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
>  			return -EINVAL;
>  		attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
>  		break;
> +	case KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS:
> +		if (system_supports_notagaccess())
> +			attr = KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS);
> +		else
> +			return -EINVAL;
> +		break;

How do you see this working when migrating a VM from one host to
another, one that supports FEAT_MTE_PERM and one that doesn't? The
current assumptions are that the VMM will replay the *exact same*
setup on the target host, and this obviously doesn't work.

>  	default:
>  		attr = KVM_S2_MEMATTR(pgt, NORMAL);
>  	}
> @@ -872,7 +880,9 @@ static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
>  static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
>  {
>  	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
> -	return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL);
> +	return kvm_pte_valid(pte) &&
> +	       ((memattr == KVM_S2_MEMATTR(pgt, NORMAL)) ||
> +		(memattr == KVM_S2_MEMATTR(pgt, NORMAL_NOTAGACCESS)));
>  }
>  
>  static bool stage2_pte_executable(kvm_pte_t pte)
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index b5824e93cee0..e56c6996332e 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1647,12 +1647,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		 *  not a permission fault implies a translation fault which
>  		 *  means mapping the page for the first time
>  		 */
> -		if (mte_allowed) {
> +		if (mte_allowed)
>  			sanitise_mte_tags(kvm, pfn, vma_pagesize);
> -		} else {
> -			ret = -EFAULT;
> -			goto out_unlock;
> -		}
> +		else
> +			prot |= KVM_PGTABLE_PROT_NORMAL_NOTAGACCESS;
>  	}
>  
>  	if (writable)
> @@ -1721,6 +1719,15 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
>  		kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
>  }
>  
> +static inline void kvm_prepare_notagaccess_exit(struct kvm_vcpu *vcpu,
> +						 gpa_t gpa, gpa_t size)
> +{
> +	vcpu->run->exit_reason = KVM_EXIT_ARM_NOTAG_ACCESS;
> +	vcpu->run->notag_access.flags = 0;
> +	vcpu->run->notag_access.gpa = gpa;
> +	vcpu->run->notag_access.size = size;

Why does size matter here? It seems pretty pointless.

> +}
> +
>  /**
>   * kvm_handle_guest_abort - handles all 2nd stage aborts
>   * @vcpu:	the VCPU pointer
> @@ -1833,6 +1840,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>  
>  	gfn = ipa >> PAGE_SHIFT;
>  	memslot = gfn_to_memslot(vcpu->kvm, gfn);
> +
> +	if (kvm_vcpu_trap_is_tagaccess(vcpu)) {
> +		/* exit to host and handle the error */
> +		kvm_prepare_notagaccess_exit(vcpu, gfn << PAGE_SHIFT, PAGE_SIZE);
> +		ret = 0;
> +		goto out;
> +	}
> +
>  	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
>  	write_fault = kvm_is_write_fault(vcpu);
>  	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
> @@ -2145,7 +2160,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  		if (!vma)
>  			break;
>  
> -		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
> +		if (kvm_has_mte(kvm) && !system_supports_notagaccess() &&
> +		    !kvm_vma_mte_allowed(vma)) {
>  			ret = -EINVAL;
>  			break;
>  		}
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 637efc055145..a8268a164c4d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -178,6 +178,7 @@ struct kvm_xen_exit {
>  #define KVM_EXIT_NOTIFY           37
>  #define KVM_EXIT_LOONGARCH_IOCSR  38
>  #define KVM_EXIT_MEMORY_FAULT     39
> +#define KVM_EXIT_ARM_NOTAG_ACCESS 40
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -446,6 +447,12 @@ struct kvm_run {
>  			__u64 gpa;
>  			__u64 size;
>  		} memory_fault;
> +		/* KVM_EXIT_ARM_NOTAG_ACCESS */
> +		struct {
> +			__u64 flags;
> +			__u64 gpa;
> +			__u64 size;
> +		} notag_access;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};

How do you plan to handle the same thing for NV?

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.