[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4a74aafe-9613-4bf2-47a1-cad0aad34323@amd.com>
Date: Wed, 30 Sep 2020 18:20:50 -0500
From: Eric van Tassell <evantass@....com>
To: Ben Gardon <bgardon@...gle.com>, linux-kernel@...r.kernel.org,
kvm@...r.kernel.org
Cc: Cannon Matthews <cannonmatthews@...gle.com>,
Paolo Bonzini <pbonzini@...hat.com>,
Peter Xu <peterx@...hat.com>,
Sean Christopherson <sean.j.christopherson@...el.com>,
Peter Shier <pshier@...gle.com>,
Peter Feiner <pfeiner@...gle.com>,
Junaid Shahid <junaids@...gle.com>,
Jim Mattson <jmattson@...gle.com>,
Yulei Zhang <yulei.kernel@...il.com>,
Wanpeng Li <kernellwp@...il.com>,
Vitaly Kuznetsov <vkuznets@...hat.com>,
Xiao Guangrong <xiaoguangrong.eric@...il.com>
Subject: Re: [PATCH 02/22] kvm: mmu: Introduce tdp_iter
On 9/25/20 4:22 PM, Ben Gardon wrote:
> The TDP iterator implements a pre-order traversal of a TDP paging
> structure. This iterator will be used in future patches to create
> an efficient implementation of the KVM MMU for the TDP case.
>
> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> machine. This series introduced no new failures.
>
> This series can be viewed in Gerrit at:
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flinux-review.googlesource.com%2Fc%2Fvirt%2Fkvm%2Fkvm%2F%2B%2F2538&data=02%7C01%7CEric.VanTassell%40amd.com%7C08ae391c1b9a4da5eb1e08d861997899%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637366659084240984&sdata=pVD8B%2BoNGf1fN18y%2Bjrwyuhlu7TbP1DhUIg%2BbP8KP2s%3D&reserved=0
>
> Signed-off-by: Ben Gardon <bgardon@...gle.com>
> ---
> arch/x86/kvm/Makefile | 3 +-
> arch/x86/kvm/mmu/mmu.c | 19 +---
> arch/x86/kvm/mmu/mmu_internal.h | 15 +++
> arch/x86/kvm/mmu/tdp_iter.c | 163 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/mmu/tdp_iter.h | 53 +++++++++++
> 5 files changed, 237 insertions(+), 16 deletions(-)
> create mode 100644 arch/x86/kvm/mmu/tdp_iter.c
> create mode 100644 arch/x86/kvm/mmu/tdp_iter.h
>
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 4a3081e9f4b5d..cf6a9947955f7 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -15,7 +15,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
>
> kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
> i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
> - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
> + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
> + mmu/tdp_iter.o
>
> kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
> kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 81240b558d67f..b48b00c8cde65 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -134,15 +134,6 @@ module_param(dbg, bool, 0644);
> #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
> #define SPTE_MMIO_MASK (3ULL << 52)
>
> -#define PT64_LEVEL_BITS 9
> -
> -#define PT64_LEVEL_SHIFT(level) \
> - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> -
> -#define PT64_INDEX(address, level)\
> - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
> -
> -
> #define PT32_LEVEL_BITS 10
>
> #define PT32_LEVEL_SHIFT(level) \
> @@ -192,8 +183,6 @@ module_param(dbg, bool, 0644);
> #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
>
> -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> -
> /* make pte_list_desc fit well in cache line */
> #define PTE_LIST_EXT 3
>
> @@ -346,7 +335,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
> }
> EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>
> -static bool is_mmio_spte(u64 spte)
> +bool is_mmio_spte(u64 spte)
> {
> return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
> }
> @@ -623,7 +612,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
> return vcpu->arch.efer & EFER_NX;
> }
>
> -static int is_shadow_present_pte(u64 pte)
> +int is_shadow_present_pte(u64 pte)
> {
> return (pte != 0) && !is_mmio_spte(pte);
From <Figure 28-1: Formats of EPTP and EPT Paging-Structure Entries" of
the manual I don't have at my fingertips right now, I believe you should
only check the low 3 bits(mask = 0x7). Since the upper bits are ignored,
might that not mean they're not guaranteed to be 0?
> }
> @@ -633,7 +622,7 @@ static int is_large_pte(u64 pte)
> return pte & PT_PAGE_SIZE_MASK;
> }
>
> -static int is_last_spte(u64 pte, int level)
> +int is_last_spte(u64 pte, int level)
> {
> if (level == PG_LEVEL_4K)
> return 1;
> @@ -647,7 +636,7 @@ static bool is_executable_pte(u64 spte)
> return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
> }
>
> -static kvm_pfn_t spte_to_pfn(u64 pte)
> +kvm_pfn_t spte_to_pfn(u64 pte)
> {
> return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> }
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 3acf3b8eb469d..65bb110847858 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -60,4 +60,19 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
> bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
> struct kvm_memory_slot *slot, u64 gfn);
>
> +#define PT64_LEVEL_BITS 9
> +
> +#define PT64_LEVEL_SHIFT(level) \
> + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
> +
> +#define PT64_INDEX(address, level)\
> + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
> +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +
> +/* Functions for interpreting SPTEs */
> +kvm_pfn_t spte_to_pfn(u64 pte);
> +bool is_mmio_spte(u64 spte);
> +int is_shadow_present_pte(u64 pte);
> +int is_last_spte(u64 pte, int level);
> +
> #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> new file mode 100644
> index 0000000000000..ee90d62d2a9b1
> --- /dev/null
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -0,0 +1,163 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include "mmu_internal.h"
> +#include "tdp_iter.h"
> +
> +/*
> + * Recalculates the pointer to the SPTE for the current GFN and level and
> + * reread the SPTE.
> + */
> +static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
> +{
> + iter->sptep = iter->pt_path[iter->level - 1] +
> + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
> + iter->old_spte = READ_ONCE(*iter->sptep);
> +}
> +
> +/*
> + * Sets a TDP iterator to walk a pre-order traversal of the paging structure
> + * rooted at root_pt, starting with the walk to translate goal_gfn.
> + */
> +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> + gfn_t goal_gfn)
> +{
> + WARN_ON(root_level < 1);
> + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
> +
> + iter->goal_gfn = goal_gfn;
> + iter->root_level = root_level;
> + iter->level = root_level;
> + iter->pt_path[iter->level - 1] = root_pt;
> +
> + iter->gfn = iter->goal_gfn -
> + (iter->goal_gfn % KVM_PAGES_PER_HPAGE(iter->level));
> + tdp_iter_refresh_sptep(iter);
> +
> + iter->valid = true;
> +}
> +
> +/*
> + * Given an SPTE and its level, returns a pointer containing the host virtual
> + * address of the child page table referenced by the SPTE. Returns null if
> + * there is no such entry.
> + */
> +u64 *spte_to_child_pt(u64 spte, int level)
> +{
> + u64 *pt;
> + /* There's no child entry if this entry isn't present */
> + if (!is_shadow_present_pte(spte))
> + return NULL;
> +
> + /* There is no child page table if this is a leaf entry. */
> + if (is_last_spte(spte, level))
> + return NULL;
> +
> + pt = (u64 *)__va(spte_to_pfn(spte) << PAGE_SHIFT);
> + return pt;
> +}
> +
> +/*
> + * Steps down one level in the paging structure towards the goal GFN. Returns
> + * true if the iterator was able to step down a level, false otherwise.
> + */
> +static bool try_step_down(struct tdp_iter *iter)
> +{
> + u64 *child_pt;
> +
> + if (iter->level == PG_LEVEL_4K)
> + return false;
> +
> + /*
> + * Reread the SPTE before stepping down to avoid traversing into page
> + * tables that are no longer linked from this entry.
> + */
> + iter->old_spte = READ_ONCE(*iter->sptep);
> +
> + child_pt = spte_to_child_pt(iter->old_spte, iter->level);
> + if (!child_pt)
> + return false;
> +
> + iter->level--;
> + iter->pt_path[iter->level - 1] = child_pt;
> + iter->gfn = iter->goal_gfn -
> + (iter->goal_gfn % KVM_PAGES_PER_HPAGE(iter->level));
> + tdp_iter_refresh_sptep(iter);
> +
> + return true;
> +}
> +
> +/*
> + * Steps to the next entry in the current page table, at the current page table
> + * level. The next entry could point to a page backing guest memory or another
> + * page table, or it could be non-present. Returns true if the iterator was
> + * able to step to the next entry in the page table, false if the iterator was
> + * already at the end of the current page table.
> + */
> +static bool try_step_side(struct tdp_iter *iter)
> +{
> + /*
> + * Check if the iterator is already at the end of the current page
> + * table.
> + */
> + if (!((iter->gfn + KVM_PAGES_PER_HPAGE(iter->level)) %
> + KVM_PAGES_PER_HPAGE(iter->level + 1)))
> + return false;
> +
> + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
> + iter->goal_gfn = iter->gfn;
> + iter->sptep++;
> + iter->old_spte = READ_ONCE(*iter->sptep);
> +
> + return true;
> +}
> +
> +/*
> + * Tries to traverse back up a level in the paging structure so that the walk
> + * can continue from the next entry in the parent page table. Returns true on a
> + * successful step up, false if already in the root page.
> + */
> +static bool try_step_up(struct tdp_iter *iter)
> +{
> + if (iter->level == iter->root_level)
> + return false;
> +
> + iter->level++;
> + iter->gfn = iter->gfn - (iter->gfn % KVM_PAGES_PER_HPAGE(iter->level));
> + tdp_iter_refresh_sptep(iter);
> +
> + return true;
> +}
> +
> +/*
> + * Step to the next SPTE in a pre-order traversal of the paging structure.
> + * To get to the next SPTE, the iterator either steps down towards the goal
> + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
> + * highter GFN.
> + *
> + * The basic algorithm is as follows:
> + * 1. If the current SPTE is a non-last-level SPTE, step down into the page
> + * table it points to.
> + * 2. If the iterator cannot step down, it will try to step to the next SPTE
> + * in the current page of the paging structure.
> + * 3. If the iterator cannot step to the next entry in the current page, it will
> + * try to step up to the parent paging structure page. In this case, that
> + * SPTE will have already been visited, and so the iterator must also step
> + * to the side again.
> + */
> +void tdp_iter_next(struct tdp_iter *iter)
> +{
> + bool done;
> +
> + done = try_step_down(iter);
> + if (done)
> + return;
> +
> + done = try_step_side(iter);
> + while (!done) {
> + if (!try_step_up(iter)) {
> + iter->valid = false;
> + break;
> + }
> + done = try_step_side(iter);
> + }
> +}
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> new file mode 100644
> index 0000000000000..b102109778eac
> --- /dev/null
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -0,0 +1,53 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __KVM_X86_MMU_TDP_ITER_H
> +#define __KVM_X86_MMU_TDP_ITER_H
> +
> +#include <linux/kvm_host.h>
> +
> +#include "mmu.h"
> +
> +/*
> + * A TDP iterator performs a pre-order walk over a TDP paging structure.
> + */
> +struct tdp_iter {
> + /*
> + * The iterator will traverse the paging structure towards the mapping
> + * for this GFN.
> + */
> + gfn_t goal_gfn;
> + /* Pointers to the page tables traversed to reach the current SPTE */
> + u64 *pt_path[PT64_ROOT_MAX_LEVEL];
> + /* A pointer to the current SPTE */
> + u64 *sptep;
> + /* The lowest GFN mapped by the current SPTE */
> + gfn_t gfn;
> + /* The level of the root page given to the iterator */
> + int root_level;
> + /* The iterator's current level within the paging structure */
> + int level;
> + /* A snapshot of the value at sptep */
> + u64 old_spte;
> + /*
> + * Whether the iterator has a valid state. This will be false if the
> + * iterator walks off the end of the paging structure.
> + */
> + bool valid;
> +};
> +
> +/*
> + * Iterates over every SPTE mapping the GFN range [start, end) in a
> + * preorder traversal.
> + */
> +#define for_each_tdp_pte(iter, root, root_level, start, end) \
> + for (tdp_iter_start(&iter, root, root_level, start); \
> + iter.valid && iter.gfn < end; \
> + tdp_iter_next(&iter))
> +
> +u64 *spte_to_child_pt(u64 pte, int level);
> +
> +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> + gfn_t goal_gfn);
> +void tdp_iter_next(struct tdp_iter *iter);
> +
> +#endif /* __KVM_X86_MMU_TDP_ITER_H */
>
Powered by blists - more mailing lists