Message-ID: <80e2dc67-9dca-4e90-8a42-21ddea329c53@arm.com>
Date: Thu, 22 Aug 2024 16:40:08 +0100
From: Steven Price <steven.price@....com>
To: "Aneesh Kumar K.V" <aneesh.kumar@...nel.org>, kvm@...r.kernel.org,
kvmarm@...ts.linux.dev
Cc: Catalin Marinas <catalin.marinas@....com>, Marc Zyngier <maz@...nel.org>,
Will Deacon <will@...nel.org>, James Morse <james.morse@....com>,
Oliver Upton <oliver.upton@...ux.dev>,
Suzuki K Poulose <suzuki.poulose@....com>, Zenghui Yu
<yuzenghui@...wei.com>, linux-arm-kernel@...ts.infradead.org,
linux-kernel@...r.kernel.org, Joey Gouly <joey.gouly@....com>,
Alexandru Elisei <alexandru.elisei@....com>,
Christoffer Dall <christoffer.dall@....com>, Fuad Tabba <tabba@...gle.com>,
linux-coco@...ts.linux.dev,
Ganapatrao Kulkarni <gankulkarni@...amperecomputing.com>,
Gavin Shan <gshan@...hat.com>, Shanker Donthineni <sdonthineni@...dia.com>,
Alper Gun <alpergun@...gle.com>
Subject: Re: [PATCH v4 21/43] arm64: RME: Runtime faulting of memory
On 22/08/2024 04:50, Aneesh Kumar K.V wrote:
> Steven Price <steven.price@....com> writes:
>
>> At runtime if the realm guest accesses memory which hasn't yet been
>> mapped then KVM needs to either populate the region or fault the guest.
>>
>> For memory in the lower (protected) region of IPA a fresh page is
>> provided to the RMM which will zero the contents. For memory in the
>> upper (shared) region of IPA, the memory from the memslot is mapped
>> into the realm VM non secure.
>>
>> Signed-off-by: Steven Price <steven.price@....com>
>> ---
>> Changes since v2:
>> * Avoid leaking memory if failing to map it in the realm.
>> * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>> * Adapt to changes in previous patches.
>> ---
>> arch/arm64/include/asm/kvm_emulate.h | 10 ++
>> arch/arm64/include/asm/kvm_rme.h | 10 ++
>> arch/arm64/kvm/mmu.c | 120 +++++++++++++++-
>> arch/arm64/kvm/rme.c | 205 +++++++++++++++++++++++++--
>> 4 files changed, 325 insertions(+), 20 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index 7430c77574e3..0b50572d3719 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -710,6 +710,16 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>> return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
>> }
>>
>> +static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
>> +{
>> + if (kvm_is_realm(kvm)) {
>> + struct realm *realm = &kvm->arch.realm;
>> +
>> + return BIT(realm->ia_bits - 1);
>> + }
>> + return 0;
>> +}
>> +
>> static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>> {
>> if (static_branch_unlikely(&kvm_rme_is_available))
>> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
>> index 0e44b20cfa48..c50854f44674 100644
>> --- a/arch/arm64/include/asm/kvm_rme.h
>> +++ b/arch/arm64/include/asm/kvm_rme.h
>> @@ -103,6 +103,16 @@ void kvm_realm_unmap_range(struct kvm *kvm,
>> unsigned long ipa,
>> u64 size,
>> bool unmap_private);
>> +int realm_map_protected(struct realm *realm,
>> + unsigned long base_ipa,
>> + struct page *dst_page,
>> + unsigned long map_size,
>> + struct kvm_mmu_memory_cache *memcache);
>> +int realm_map_non_secure(struct realm *realm,
>> + unsigned long ipa,
>> + struct page *page,
>> + unsigned long map_size,
>> + struct kvm_mmu_memory_cache *memcache);
>> int realm_set_ipa_state(struct kvm_vcpu *vcpu,
>> unsigned long addr, unsigned long end,
>> unsigned long ripas,
>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>> index 620d26810019..eb8b8d013f3e 100644
>> --- a/arch/arm64/kvm/mmu.c
>> +++ b/arch/arm64/kvm/mmu.c
>> @@ -325,8 +325,13 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>>
>> lockdep_assert_held_write(&kvm->mmu_lock);
>> WARN_ON(size & ~PAGE_MASK);
>> - WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
>> - may_block));
>> +
>> + if (kvm_is_realm(kvm))
>> + kvm_realm_unmap_range(kvm, start, size, !only_shared);
>> + else
>> + WARN_ON(stage2_apply_range(mmu, start, end,
>> + kvm_pgtable_stage2_unmap,
>> + may_block));
>> }
>>
>> void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
>> @@ -345,7 +350,10 @@ static void stage2_flush_memslot(struct kvm *kvm,
>> phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
>> phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
>>
>> - kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>> + if (kvm_is_realm(kvm))
>> + kvm_realm_unmap_range(kvm, addr, end - addr, false);
>> + else
>> + kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>> }
>>
>> /**
>> @@ -1037,6 +1045,10 @@ void stage2_unmap_vm(struct kvm *kvm)
>> struct kvm_memory_slot *memslot;
>> int idx, bkt;
>>
>> + /* For realms this is handled by the RMM so nothing to do here */
>> + if (kvm_is_realm(kvm))
>> + return;
>> +
>> idx = srcu_read_lock(&kvm->srcu);
>> mmap_read_lock(current->mm);
>> write_lock(&kvm->mmu_lock);
>> @@ -1062,6 +1074,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>> if (kvm_is_realm(kvm) &&
>> (kvm_realm_state(kvm) != REALM_STATE_DEAD &&
>> kvm_realm_state(kvm) != REALM_STATE_NONE)) {
>> + kvm_stage2_unmap_range(mmu, 0, (~0ULL) & PAGE_MASK);
>> write_unlock(&kvm->mmu_lock);
>> kvm_realm_destroy_rtts(kvm, pgt->ia_bits);
>> return;
>> @@ -1428,6 +1441,71 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>> return vma->vm_flags & VM_MTE_ALLOWED;
>> }
>>
>> +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
>> + kvm_pfn_t pfn, unsigned long map_size,
>> + enum kvm_pgtable_prot prot,
>> + struct kvm_mmu_memory_cache *memcache)
>> +{
>> + struct realm *realm = &kvm->arch.realm;
>> + struct page *page = pfn_to_page(pfn);
>> +
>> + if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
>> + return -EFAULT;
>> +
>> + if (!realm_is_addr_protected(realm, ipa))
>> + return realm_map_non_secure(realm, ipa, page, map_size,
>> + memcache);
>> +
>> + return realm_map_protected(realm, ipa, page, map_size, memcache);
>> +}
>> +
>> +static int private_memslot_fault(struct kvm_vcpu *vcpu,
>> + phys_addr_t fault_ipa,
>> + struct kvm_memory_slot *memslot)
>> +{
>> + struct kvm *kvm = vcpu->kvm;
>> + gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
>> + gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
>> + bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
>> + bool priv_exists = kvm_mem_is_private(kvm, gfn);
>> + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>> + kvm_pfn_t pfn;
>> + int ret;
>> +
>> + if (priv_exists != is_priv_gfn) {
>> + kvm_prepare_memory_fault_exit(vcpu,
>> + fault_ipa & ~gpa_stolen_mask,
>> + PAGE_SIZE,
>> + kvm_is_write_fault(vcpu),
>> + false, is_priv_gfn);
>> +
>> + return 0;
>> + }
>> +
>> + if (!is_priv_gfn) {
>> + /* Not a private mapping, handling normally */
>> + return -EAGAIN;
>> + }
>>
>
> Instead of that EAGAIN, is it better to handle it as below?
I'm not finding the version below easier to read.
> arch/arm64/kvm/mmu.c | 24 ++++++++++++++++--------
> 1 file changed, 16 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 1eddbc7d7156..33ef95b5c94a 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1480,11 +1480,6 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
> return 0;
> }
>
> - if (!is_priv_gfn) {
> - /* Not a private mapping, handling normally */
> - return -EAGAIN;
> - }
> -
> ret = kvm_mmu_topup_memory_cache(memcache,
> kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
> if (ret)
> @@ -1925,12 +1920,25 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
> memslot = gfn_to_memslot(vcpu->kvm, gfn);
>
> - if (kvm_slot_can_be_private(memslot)) {
> - ret = private_memslot_fault(vcpu, fault_ipa, memslot);
> - if (ret != -EAGAIN)
> + if (kvm_slot_can_be_private(memslot) &&
> + kvm_is_private_gpa(vcpu->kvm, ipa)) {
I presume kvm_is_private_gpa() is defined as something like:
static bool kvm_is_private_gpa(struct kvm *kvm, phys_addr_t ipa)
{
	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);

	return !((ipa & gpa_stolen_mask) == gpa_stolen_mask);
}
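If so, then just to illustrate the private/shared split, here's a
standalone userspace sketch (untested, purely illustrative - the
ia_bits value of 48 and the local helper name are made up for the
example, they aren't taken from the patch):

/* Illustration only: mimics the stolen-bit check outside the kernel. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t gpa_t;

static bool is_private_gpa(unsigned int ia_bits, gpa_t ipa)
{
	gpa_t gpa_stolen_mask = 1ULL << (ia_bits - 1);

	return !((ipa & gpa_stolen_mask) == gpa_stolen_mask);
}

int main(void)
{
	unsigned int ia_bits = 48;	/* example value only */
	gpa_t priv = 0x1000;
	gpa_t shared = priv | (1ULL << (ia_bits - 1));

	/* The private GPA passes the check, its shared alias doesn't. */
	printf("0x%llx private? %d\n", (unsigned long long)priv,
	       is_private_gpa(ia_bits, priv));		/* prints 1 */
	printf("0x%llx private? %d\n", (unsigned long long)shared,
	       is_private_gpa(ia_bits, shared));	/* prints 0 */
	return 0;
}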
> + ret = private_memslot_fault(vcpu, ipa, memslot);
So this handles the case in private_memslot_fault() where is_priv_gfn is
true, which gives a small simplification in private_memslot_fault().
> goto out;
> }
> +	/* attribute mismatch: shared access fault on memory with the private attribute */
> + if (kvm_mem_is_private(vcpu->kvm, gfn)) {
> + /* let VMM fixup the memory attribute */
> + kvm_prepare_memory_fault_exit(vcpu,
> + kvm_gpa_from_fault(vcpu->kvm, ipa),
> + PAGE_SIZE,
> + kvm_is_write_fault(vcpu),
> + false, false);
And then we have to duplicate the code here for calling
kvm_prepare_memory_fault_exit(), which seems a bit ugly to me. Am I
missing something? Your patch doesn't seem complete.
> +
> + ret = 0;
> + goto out;
> + }
>
> +	/* Slot can be private, but fault addr is not, handle that as a normal fault */
> hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
> write_fault = kvm_is_write_fault(vcpu);
> if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
Note your email had a signature line "--" here, which causes my email
client to drop the rest of your reply - it's worth removing that from
the git output when sending diffs. I've attempted to include your
second diff below manually.
> Instead of referring to this as stolen bits, is it better to do
>
> arch/arm64/include/asm/kvm_emulate.h | 20 +++++++++++++++++---
> arch/arm64/kvm/mmu.c | 21 ++++++++-------------
> 2 files changed, 25 insertions(+), 16 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 0b50572d3719..790412fd53b8 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -710,14 +710,28 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
> return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
> }
>
> -static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
> +static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t fault_addr)
> {
> + gpa_t addr_mask;
> +
> if (kvm_is_realm(kvm)) {
> struct realm *realm = &kvm->arch.realm;
>
> - return BIT(realm->ia_bits - 1);
> + addr_mask = BIT(realm->ia_bits - 1);
> + /* clear shared bit and return */
> + return fault_addr & ~addr_mask;
> }
> - return 0;
> + return fault_addr;
> +}
> +
> +static inline bool kvm_is_private_gpa(struct kvm *kvm, phys_addr_t fault_addr)
> +{
> + /*
> + * For Realms, the shared address is an alias of the private GPA
> + * with top bit set and we have a single address space. Thus if the
> + * fault address matches the GPA, it is the private GPA
> + */
> + return fault_addr == kvm_gpa_from_fault(kvm, fault_addr);
> }
Ah, so here's the missing function from above.
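And, for a realm at least, the two formulations behave the same:
kvm_gpa_from_fault() only ever clears BIT(realm->ia_bits - 1), so its
result differs from fault_addr exactly when that bit was set - i.e.
exactly when the old (fault_ipa & gpa_stolen_mask) == gpa_stolen_mask
test would have said "shared".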
>
> static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index eb8b8d013f3e..1eddbc7d7156 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1464,20 +1464,18 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
> struct kvm_memory_slot *memslot)
> {
> struct kvm *kvm = vcpu->kvm;
> - gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
> - gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> - bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
> - bool priv_exists = kvm_mem_is_private(kvm, gfn);
> + gfn_t gfn = kvm_gpa_from_fault(kvm, fault_ipa) >> PAGE_SHIFT;
> struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> kvm_pfn_t pfn;
> int ret;
>
> - if (priv_exists != is_priv_gfn) {
> + if (!kvm_mem_is_private(kvm, gfn)) {
> + /* let VMM fixup the memory attribute */
> kvm_prepare_memory_fault_exit(vcpu,
> - fault_ipa & ~gpa_stolen_mask,
> + kvm_gpa_from_fault(kvm, fault_ipa),
> PAGE_SIZE,
> kvm_is_write_fault(vcpu),
> - false, is_priv_gfn);
> + false, true);
>
> return 0;
> }
> @@ -1527,7 +1525,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> long vma_pagesize, fault_granule;
> enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> struct kvm_pgtable *pgt;
> - gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>
> if (fault_is_perm)
> fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> @@ -1640,7 +1637,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
> fault_ipa &= ~(vma_pagesize - 1);
>
> - gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> + gfn = kvm_gpa_from_fault(kvm, ipa) >> PAGE_SHIFT;
> mte_allowed = kvm_vma_mte_allowed(vma);
>
> vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> @@ -1835,7 +1832,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> struct kvm_memory_slot *memslot;
> unsigned long hva;
> bool is_iabt, write_fault, writable;
> - gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
> gfn_t gfn;
> int ret, idx;
>
> @@ -1926,7 +1922,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> nested = &nested_trans;
> }
>
> - gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> + gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
> memslot = gfn_to_memslot(vcpu->kvm, gfn);
>
> if (kvm_slot_can_be_private(memslot)) {
> @@ -1978,8 +1974,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> * of the page size.
> */
> ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
> - ipa &= ~gpa_stolen_mask;
> - ret = io_mem_abort(vcpu, ipa);
> + ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, ipa));
> goto out_unlock;
> }
I can see your point - kvm_gpa_from_fault() makes sense. I'm still not
convinced about duplicating the kvm_prepare_memory_fault_exit() call
though.
How about the following (untested):
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 0b50572d3719..fa03520d7933 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -710,14 +710,14 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
 	return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE;
 }
 
-static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
+static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t fault_ipa)
 {
 	if (kvm_is_realm(kvm)) {
 		struct realm *realm = &kvm->arch.realm;
 
-		return BIT(realm->ia_bits - 1);
+		return fault_ipa & ~BIT(realm->ia_bits - 1);
 	}
-	return 0;
+	return fault_ipa;
 }
 
 static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d7e8b0c4f2a3..c0a3054201a9 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1468,9 +1468,9 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
 				 struct kvm_memory_slot *memslot)
 {
 	struct kvm *kvm = vcpu->kvm;
-	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(kvm);
-	gfn_t gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
-	bool is_priv_gfn = !((fault_ipa & gpa_stolen_mask) == gpa_stolen_mask);
+	gpa_t gpa = kvm_gpa_from_fault(kvm, fault_ipa);
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	bool is_priv_gfn = (gpa == fault_ipa);
 	bool priv_exists = kvm_mem_is_private(kvm, gfn);
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	kvm_pfn_t pfn;
@@ -1478,7 +1478,7 @@ static int private_memslot_fault(struct kvm_vcpu *vcpu,
 
 	if (priv_exists != is_priv_gfn) {
 		kvm_prepare_memory_fault_exit(vcpu,
-					      fault_ipa & ~gpa_stolen_mask,
+					      gpa,
 					      PAGE_SIZE,
 					      kvm_is_write_fault(vcpu),
 					      false, is_priv_gfn);
@@ -1531,7 +1531,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
-	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
 
 	if (fault_is_perm)
 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
@@ -1648,7 +1647,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
 		fault_ipa &= ~(vma_pagesize - 1);
 
-	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
+	gfn = kvm_gpa_from_fault(kvm, ipa) >> PAGE_SHIFT;
 	mte_allowed = kvm_vma_mte_allowed(vma);
 
 	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1843,7 +1842,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 	struct kvm_memory_slot *memslot;
 	unsigned long hva;
 	bool is_iabt, write_fault, writable;
-	gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
 	gfn_t gfn;
 	int ret, idx;
 
@@ -1934,7 +1932,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		nested = &nested_trans;
 	}
 
-	gfn = (ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
+	gfn = kvm_gpa_from_fault(vcpu->kvm, ipa) >> PAGE_SHIFT;
 	memslot = gfn_to_memslot(vcpu->kvm, gfn);
 
 	if (kvm_slot_can_be_private(memslot)) {
@@ -1986,8 +1984,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		 * of the page size.
 		 */
 		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
-		ipa &= ~gpa_stolen_mask;
-		ret = io_mem_abort(vcpu, ipa);
+		ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, ipa));
 		goto out_unlock;
 	}
 
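That way there's still only one kvm_prepare_memory_fault_exit() call
site for the attribute-mismatch case in private_memslot_fault(),
kvm_handle_guest_abort() still routes every fault on a private-capable
slot through it, and user_mem_abort()/io_mem_abort() only ever see the
canonical GPA with the shared bit cleared.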
Thanks,
Steve