Message-ID: <CA+EHjTxiDiZ4P2AVSDErUfr612Ov0UZira+9qQkRQJK_Ki7uug@mail.gmail.com>
Date: Tue, 10 Dec 2024 15:46:57 +0000
From: Fuad Tabba <tabba@...gle.com>
To: Quentin Perret <qperret@...gle.com>
Cc: Marc Zyngier <maz@...nel.org>, Oliver Upton <oliver.upton@...ux.dev>,
Joey Gouly <joey.gouly@....com>, Suzuki K Poulose <suzuki.poulose@....com>,
Zenghui Yu <yuzenghui@...wei.com>, Catalin Marinas <catalin.marinas@....com>,
Will Deacon <will@...nel.org>, Vincent Donnefort <vdonnefort@...gle.com>,
Sebastian Ene <sebastianene@...gle.com>, linux-arm-kernel@...ts.infradead.org,
kvmarm@...ts.linux.dev, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 04/18] KVM: arm64: Move host page ownership tracking to
the hyp vmemmap
Hi Quentin,
On Tue, 10 Dec 2024 at 15:29, Quentin Perret <qperret@...gle.com> wrote:
>
> Hey Fuad,
>
> On Tuesday 10 Dec 2024 at 13:02:45 (+0000), Fuad Tabba wrote:
> > Hi Quentin,
> >
> > On Tue, 3 Dec 2024 at 10:37, Quentin Perret <qperret@...gle.com> wrote:
> > >
> > > We currently store part of the page-tracking state in PTE software bits
> > > for the host, guests and the hypervisor. This is sub-optimal when e.g.
> > > sharing pages, as this forces us to break block mappings purely to support
> > > this software tracking. This causes an unnecessarily fragmented stage-2
> > > page-table for the host in particular when it shares pages with Secure,
> > > which can lead to measurable regressions. Moreover, having this state
> > > stored in the page-table forces us to do multiple costly walks on the
> > > page transition path, hence causing overhead.
> > >
> > > In order to work around these problems, move the host-side page-tracking
> > > logic from SW bits in its stage-2 PTEs to the hypervisor's vmemmap.
> > >
> > > Signed-off-by: Quentin Perret <qperret@...gle.com>
> > > ---
> > > arch/arm64/kvm/hyp/include/nvhe/memory.h |  6 +-
> > > arch/arm64/kvm/hyp/nvhe/mem_protect.c    | 94 ++++++++++++++++--------
> > > arch/arm64/kvm/hyp/nvhe/setup.c          |  7 +-
> > > 3 files changed, 71 insertions(+), 36 deletions(-)
> > >
> > > diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
> > > index 88cb8ff9e769..08f3a0416d4c 100644
> > > --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
> > > +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
> > > @@ -8,7 +8,7 @@
> > > #include <linux/types.h>
> > >
> > > /*
> > > - * SW bits 0-1 are reserved to track the memory ownership state of each page:
> > > + * Bits 0-1 are reserved to track the memory ownership state of each page:
> > > * 00: The page is owned exclusively by the page-table owner.
> > > * 01: The page is owned by the page-table owner, but is shared
> > > * with another entity.
> >
> > Not shown in this patch, but a couple of lines below, you might want
> > to update the comment on PKVM_NOPAGE to fix the reference to "PTE's SW
> > bits":
>
> I actually think the comment is still correct -- PKVM_NOPAGE never goes
> in the software bits, with or without this patch, so I figured we could
> leave it as-is. But happy to reword if you have a good idea :)
I see, no, that's fine.
> > > /* Meta-states which aren't encoded directly in the PTE's SW bits */
> > > PKVM_NOPAGE = BIT(2),
> >
> > > @@ -44,7 +44,9 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
> > > struct hyp_page {
> > > u16 refcount;
> > > u8 order;
> > > - u8 reserved;
> > > +
> > > + /* Host (non-meta) state. Guarded by the host stage-2 lock. */
> > > + enum pkvm_page_state host_state : 8;
> > > };
> > >
> > > extern u64 __hyp_vmemmap;
> > > diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> > > index caba3e4bd09e..1595081c4f6b 100644
> > > --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> > > +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
> > > @@ -201,8 +201,8 @@ static void *guest_s2_zalloc_page(void *mc)
> > >
> > > memset(addr, 0, PAGE_SIZE);
> > > p = hyp_virt_to_page(addr);
> > > - memset(p, 0, sizeof(*p));
> > > p->refcount = 1;
> > > + p->order = 0;
> > >
> > > return addr;
> > > }
> > > @@ -268,6 +268,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
> > >
> > > void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
> > > {
> > > + struct hyp_page *page;
> > > void *addr;
> > >
> > > /* Dump all pgtable pages in the hyp_pool */
> > > @@ -279,7 +280,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
> > > /* Drain the hyp_pool into the memcache */
> > > addr = hyp_alloc_pages(&vm->pool, 0);
> > > while (addr) {
> > > - memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
> > > + page = hyp_virt_to_page(addr);
> > > + page->refcount = 0;
> > > + page->order = 0;
> > > push_hyp_memcache(mc, addr, hyp_virt_to_phys);
> > > WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
> > > addr = hyp_alloc_pages(&vm->pool, 0);
> > > @@ -382,19 +385,25 @@ bool addr_is_memory(phys_addr_t phys)
> > > return !!find_mem_range(phys, &range);
> > > }
> > >
> > > -static bool addr_is_allowed_memory(phys_addr_t phys)
> > > +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
> > > +{
> > > + return range->start <= addr && addr < range->end;
> > > +}
> > > +
> > > +static int range_is_allowed_memory(u64 start, u64 end)
> >
> > The name of this function, "range_*is*_...", implies that it returns a
> > boolean, as other functions in this file (and patch) with similar
> > names do, but it returns an error code instead. Maybe
> > check_range_allowed_memory()?
>
> Ack, I'll rename in v3.
>
> > > {
> > > struct memblock_region *reg;
> > > struct kvm_mem_range range;
> > >
> > > - reg = find_mem_range(phys, &range);
> > > + /* Can't check the state of both MMIO and memory regions at once */
> >
> > I don't understand this comment in relation to the code. Could you
> > explain it to me please?
>
> find_mem_range() will iterate the list of memblocks to find the 'range'
> in which @start falls. That might either be in a memblock (so @addr is
> memory, and @reg != NULL) or outside of one (so @addr is mmio, and
> @reg == NULL). The check right after ensures that @end is in the same
> PA range as @start. IOW, this checks that [start, end) doesn't span both
> memory and MMIO, because the following logic wouldn't work for a mixed
> case like that.
I understand now. I think it might be worth elaborating a bit on the
comment to clarify that. What was confusing to me is that the comment
refers to checking state, but it's in a function that does not care
about page state; i.e., it's not immediately obvious that its primary
callers are __host_check_page_state_range() and other functions
that check the state of a range.
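Something along these lines perhaps, purely as a sketch (wording
entirely up to you):

        /*
         * find_mem_range() returns the memblock (or the hole between
         * memblocks) containing @start, so checking that @end - 1 falls
         * in the same range guarantees that [start, end) is entirely
         * memory or entirely MMIO. Callers that check the state of a
         * range, e.g. __host_check_page_state_range(), rely on not
         * mixing the two.
         */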
> > > + reg = find_mem_range(start, &range);
> > > + if (!is_in_mem_range(end - 1, &range))
> > > + return -EINVAL;
> > >
> > > - return reg && !(reg->flags & MEMBLOCK_NOMAP);
> > > -}
> > > + if (!reg || reg->flags & MEMBLOCK_NOMAP)
> > > + return -EPERM;
> > >
> > > -static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
> > > -{
> > > - return range->start <= addr && addr < range->end;
> > > + return 0;
> > > }
> > >
> > > static bool range_is_memory(u64 start, u64 end)
> > > @@ -454,8 +463,11 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
> > > if (kvm_pte_valid(pte))
> > > return -EAGAIN;
> > >
> > > - if (pte)
> > > + if (pte) {
> > > + WARN_ON(addr_is_memory(addr) &&
> > > + !(hyp_phys_to_page(addr)->host_state & PKVM_NOPAGE));
> >
> > nit: since the host state is now an enum, should this just be an
> > equality check rather than an &? This makes it consistent with other
> > checks of pkvm_page_state in this patch too.
>
> We don't currently have a state that is additive to PKVM_NOPAGE, so no
> objection from me.
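Thanks. For the record, the equality check I had in mind was simply
something like:

        WARN_ON(addr_is_memory(addr) &&
                hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE);

which reads a bit more naturally now that host_state is an enum.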
>
> > > return -EPERM;
> > > + }
> > >
> > > do {
> > > u64 granule = kvm_granule_size(level);
> > > @@ -477,10 +489,29 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
> > > return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
> > > }
> > >
> > > +static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state)
> > > +{
> > > + phys_addr_t end = addr + size;
> >
> > nit: newline
> >
> > > + for (; addr < end; addr += PAGE_SIZE)
> > > + hyp_phys_to_page(addr)->host_state = state;
> > > +}
> > > +
> > > int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
> > > {
> > > - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
> > > - addr, size, &host_s2_pool, owner_id);
> > > + int ret;
> > > +
> > > + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
> > > + addr, size, &host_s2_pool, owner_id);
> > > + if (ret || !addr_is_memory(addr))
> > > + return ret;
> >
> > Can hyp set an owner for an address that isn't memory? Trying to
> > understand why we need to update the host stage2 pagetable but not the
> > hypervisor's vmemmap in that case.
>
> I think the answer is not currently, but we will once we have to e.g.
> donate IOMMU registers to EL2 and things of that nature. Note that this
> does require an extension to __host_check_page_state_range() to go query
> the page-table 'the old way' for MMIO addresses, though that isn't done
> in this series. If you think strongly that this is confusing, I'm happy
> to drop that check and we'll add it back with the IOMMU series or
> something like that.
I think it's worth a comment if you're not dropping the check.
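Maybe something like this (just a sketch, reword as you see fit):

        /*
         * The hyp vmemmap only tracks the state of memory pages, so
         * skip the update for MMIO ranges (e.g. IOMMU registers donated
         * to EL2 in the future); their state will have to be tracked in
         * the page-table 'the old way'.
         */
        if (ret || !addr_is_memory(addr))
                return ret;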
>
> > > +
> > > + /* Don't forget to update the vmemmap tracking for the host */
> > > + if (owner_id == PKVM_ID_HOST)
> > > + __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
> > > + else
> > > + __host_update_page_state(addr, size, PKVM_NOPAGE);
> > > +
> > > + return 0;
> > > }
> > >
> > > static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
> > > @@ -604,35 +635,38 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
> > > return kvm_pgtable_walk(pgt, addr, size, &walker);
> > > }
> > >
> > > -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr)
> > > -{
> > > - if (!addr_is_allowed_memory(addr))
> > > - return PKVM_NOPAGE;
> > > -
> > > - if (!kvm_pte_valid(pte) && pte)
> > > - return PKVM_NOPAGE;
> > > -
> > > - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
> > > -}
> > > -
> > > static int __host_check_page_state_range(u64 addr, u64 size,
> > > enum pkvm_page_state state)
> > > {
> > > - struct check_walk_data d = {
> > > - .desired = state,
> > > - .get_page_state = host_get_page_state,
> > > - };
> > > + u64 end = addr + size;
> > > + int ret;
> > > +
> > > + ret = range_is_allowed_memory(addr, end);
> > > + if (ret)
> > > + return ret;
> > >
> > > hyp_assert_lock_held(&host_mmu.lock);
> > > - return check_page_state_range(&host_mmu.pgt, addr, size, &d);
> > > + for (; addr < end; addr += PAGE_SIZE) {
> > > + if (hyp_phys_to_page(addr)->host_state != state)
> > > + return -EPERM;
> > > + }
> > > +
> > > + return 0;
> > > }
> > >
> > > static int __host_set_page_state_range(u64 addr, u64 size,
> > > enum pkvm_page_state state)
> > > {
> > > - enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state);
> > > + if (hyp_phys_to_page(addr)->host_state & PKVM_NOPAGE) {
> >
> > Same nit as above regarding checking for PKVM_NOPAGE
> >
> > Cheers,
> > /fuad
> >
> >
> > > + int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);
> > >
> > > - return host_stage2_idmap_locked(addr, size, prot);
> > > + if (ret)
> > > + return ret;
> > > + }
> > > +
> > > + __host_update_page_state(addr, size, state);
> > > +
> > > + return 0;
> > > }
> > >
> > > static int host_request_owned_transition(u64 *completer_addr,
> > > diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
> > > index cbdd18cd3f98..7e04d1c2a03d 100644
> > > --- a/arch/arm64/kvm/hyp/nvhe/setup.c
> > > +++ b/arch/arm64/kvm/hyp/nvhe/setup.c
> > > @@ -180,7 +180,6 @@ static void hpool_put_page(void *addr)
> > > static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > > enum kvm_pgtable_walk_flags visit)
> > > {
> > > - enum kvm_pgtable_prot prot;
> > > enum pkvm_page_state state;
> > > phys_addr_t phys;
> > >
> > > @@ -203,16 +202,16 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > > case PKVM_PAGE_OWNED:
> > > return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
> > > case PKVM_PAGE_SHARED_OWNED:
> > > - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
> > > + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED;
> > > break;
> > > case PKVM_PAGE_SHARED_BORROWED:
> > > - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
> > > + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED;
> > > break;
> > > default:
> > > return -EINVAL;
> > > }
> > >
> > > - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
> > > + return 0;
> > > }
> > >
> > > static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx,
> > > --
> > > 2.47.0.338.g60cca15819-goog
> > >