lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250919082534.17376-1-roypat@amazon.co.uk>
Date: Fri, 19 Sep 2025 08:25:36 +0000
From: "Roy, Patrick" <roypat@...zon.co.uk>
To: "rppt@...nel.org" <rppt@...nel.org>
CC: "Liam.Howlett@...cle.com" <Liam.Howlett@...cle.com>,
	"agordeev@...ux.ibm.com" <agordeev@...ux.ibm.com>,
	"akpm@...ux-foundation.org" <akpm@...ux-foundation.org>, "alex@...ti.fr"
	<alex@...ti.fr>, "andrii@...nel.org" <andrii@...nel.org>, "anna@...nel.org"
	<anna@...nel.org>, "aou@...s.berkeley.edu" <aou@...s.berkeley.edu>,
	"ast@...nel.org" <ast@...nel.org>, "axelrasmussen@...gle.com"
	<axelrasmussen@...gle.com>, "borntraeger@...ux.ibm.com"
	<borntraeger@...ux.ibm.com>, "bp@...en8.de" <bp@...en8.de>,
	"bpf@...r.kernel.org" <bpf@...r.kernel.org>, "brauner@...nel.org"
	<brauner@...nel.org>, "catalin.marinas@....com" <catalin.marinas@....com>,
	"chenhuacai@...nel.org" <chenhuacai@...nel.org>, "corbet@....net"
	<corbet@....net>, "daniel@...earbox.net" <daniel@...earbox.net>,
	"dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
	"david@...hat.com" <david@...hat.com>, "derekmn@...zon.co.uk"
	<derekmn@...zon.co.uk>, "devel@...ts.orangefs.org"
	<devel@...ts.orangefs.org>, "eddyz87@...il.com" <eddyz87@...il.com>,
	"gerald.schaefer@...ux.ibm.com" <gerald.schaefer@...ux.ibm.com>,
	"gor@...ux.ibm.com" <gor@...ux.ibm.com>, "hannes@...xchg.org"
	<hannes@...xchg.org>, "haoluo@...gle.com" <haoluo@...gle.com>,
	"hca@...ux.ibm.com" <hca@...ux.ibm.com>, "hpa@...or.com" <hpa@...or.com>,
	"hubcap@...ibond.com" <hubcap@...ibond.com>, "jack@...e.cz" <jack@...e.cz>,
	"Thomson, Jack" <jackabt@...zon.co.uk>, "jannh@...gle.com"
	<jannh@...gle.com>, "jgg@...pe.ca" <jgg@...pe.ca>, "jhubbard@...dia.com"
	<jhubbard@...dia.com>, "joey.gouly@....com" <joey.gouly@....com>,
	"john.fastabend@...il.com" <john.fastabend@...il.com>, "jolsa@...nel.org"
	<jolsa@...nel.org>, "Kalyazin, Nikita" <kalyazin@...zon.co.uk>,
	"kernel@...0n.name" <kernel@...0n.name>, "kpsingh@...nel.org"
	<kpsingh@...nel.org>, "kvm@...r.kernel.org" <kvm@...r.kernel.org>,
	"kvmarm@...ts.linux.dev" <kvmarm@...ts.linux.dev>,
	"linux-arm-kernel@...ts.infradead.org"
	<linux-arm-kernel@...ts.infradead.org>, "linux-doc@...r.kernel.org"
	<linux-doc@...r.kernel.org>, "linux-fsdevel@...r.kernel.org"
	<linux-fsdevel@...r.kernel.org>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>, "linux-kselftest@...r.kernel.org"
	<linux-kselftest@...r.kernel.org>, "linux-mm@...ck.org" <linux-mm@...ck.org>,
	"linux-nfs@...r.kernel.org" <linux-nfs@...r.kernel.org>,
	"linux-riscv@...ts.infradead.org" <linux-riscv@...ts.infradead.org>,
	"linux-s390@...r.kernel.org" <linux-s390@...r.kernel.org>,
	"loongarch@...ts.linux.dev" <loongarch@...ts.linux.dev>,
	"lorenzo.stoakes@...cle.com" <lorenzo.stoakes@...cle.com>, "luto@...nel.org"
	<luto@...nel.org>, "martin.lau@...ux.dev" <martin.lau@...ux.dev>,
	"martin@...ibond.com" <martin@...ibond.com>, "maz@...nel.org"
	<maz@...nel.org>, "mhocko@...e.com" <mhocko@...e.com>, "mingo@...hat.com"
	<mingo@...hat.com>, "oliver.upton@...ux.dev" <oliver.upton@...ux.dev>,
	"palmer@...belt.com" <palmer@...belt.com>, "paul.walmsley@...ive.com"
	<paul.walmsley@...ive.com>, "pbonzini@...hat.com" <pbonzini@...hat.com>,
	"peterx@...hat.com" <peterx@...hat.com>, "peterz@...radead.org"
	<peterz@...radead.org>, "pfalcato@...e.de" <pfalcato@...e.de>, "Roy, Patrick"
	<roypat@...zon.co.uk>, "sdf@...ichev.me" <sdf@...ichev.me>,
	"seanjc@...gle.com" <seanjc@...gle.com>, "shakeel.butt@...ux.dev"
	<shakeel.butt@...ux.dev>, "shuah@...nel.org" <shuah@...nel.org>,
	"song@...nel.org" <song@...nel.org>, "surenb@...gle.com" <surenb@...gle.com>,
	"suzuki.poulose@....com" <suzuki.poulose@....com>, "svens@...ux.ibm.com"
	<svens@...ux.ibm.com>, "tglx@...utronix.de" <tglx@...utronix.de>,
	"trondmy@...nel.org" <trondmy@...nel.org>, "vbabka@...e.cz" <vbabka@...e.cz>,
	"viro@...iv.linux.org.uk" <viro@...iv.linux.org.uk>, "weixugc@...gle.com"
	<weixugc@...gle.com>, "will@...nel.org" <will@...nel.org>,
	"willy@...radead.org" <willy@...radead.org>, "x86@...nel.org"
	<x86@...nel.org>, "Cali, Marco" <xmarcalx@...zon.co.uk>,
	"yonghong.song@...ux.dev" <yonghong.song@...ux.dev>, "yuanchu@...gle.com"
	<yuanchu@...gle.com>, "yuzenghui@...wei.com" <yuzenghui@...wei.com>,
	"zhengqi.arch@...edance.com" <zhengqi.arch@...edance.com>
Subject: Re: [PATCH v6 05/11] KVM: guest_memfd: Add flag to remove from direct
 map

Hi Mike,

...

>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 2f2394cce24e..0bfd8e5fd9de 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -19,6 +19,7 @@
>>  #include <linux/maple_tree.h>
>>  #include <linux/percpu.h>
>>  #include <linux/psci.h>
>> +#include <linux/set_memory.h>
>>  #include <asm/arch_gicv3.h>
>>  #include <asm/barrier.h>
>>  #include <asm/cpufeature.h>
>> @@ -1706,5 +1707,16 @@ void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt);
>>  void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1);
>>  void check_feature_map(void);
>>
>> +#ifdef CONFIG_KVM_GUEST_MEMFD
>> +static inline bool kvm_arch_gmem_supports_no_direct_map(void)
>> +{
>> +     /*
>> +      * Without FWB, direct map access is needed in kvm_pgtable_stage2_map(),
>> +      * as it calls dcache_clean_inval_poc().
>> +      */
>> +     return can_set_direct_map() && cpus_have_final_cap(ARM64_HAS_STAGE2_FWB);
>> +}
>> +#define kvm_arch_gmem_supports_no_direct_map kvm_arch_gmem_supports_no_direct_map
>> +#endif /* CONFIG_KVM_GUEST_MEMFD */
>>
>>  #endif /* __ARM64_KVM_HOST_H__ */
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 1d0585616aa3..a9468bce55f2 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -36,6 +36,7 @@
>>  #include <linux/rbtree.h>
>>  #include <linux/xarray.h>
>>  #include <asm/signal.h>
>> +#include <linux/set_memory.h>
> 
> The set_memory APIs are not used in the header, no need to include it here.
> 

Ack!

>>  #include <linux/kvm.h>
>>  #include <linux/kvm_para.h>
>> @@ -731,6 +732,12 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
>>  bool kvm_arch_supports_gmem_mmap(struct kvm *kvm);
>>  #endif
>>
>> +#ifdef CONFIG_KVM_GUEST_MEMFD
>> +#ifndef kvm_arch_gmem_supports_no_direct_map
>> +#define kvm_arch_gmem_supports_no_direct_map can_set_direct_map
>> +#endif
>> +#endif /* CONFIG_KVM_GUEST_MEMFD */
>> +
>>  #ifndef kvm_arch_has_readonly_mem
>>  static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
>>  {
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 6efa98a57ec1..33c8e8946019 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -963,6 +963,7 @@ struct kvm_enable_cap {
>>  #define KVM_CAP_RISCV_MP_STATE_RESET 242
>>  #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
>>  #define KVM_CAP_GUEST_MEMFD_MMAP 244
>> +#define KVM_CAP_GUEST_MEMFD_NO_DIRECT_MAP 245
>>
>>  struct kvm_irq_routing_irqchip {
>>       __u32 irqchip;
>> @@ -1600,6 +1601,7 @@ struct kvm_memory_attributes {
>>
>>  #define KVM_CREATE_GUEST_MEMFD       _IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
>>  #define GUEST_MEMFD_FLAG_MMAP        (1ULL << 0)
>> +#define GUEST_MEMFD_FLAG_NO_DIRECT_MAP (1ULL << 1)
>>
>>  struct kvm_create_guest_memfd {
>>       __u64 size;
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 81028984ff89..3c64099fc98a 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -4,6 +4,7 @@
>>  #include <linux/kvm_host.h>
>>  #include <linux/pagemap.h>
>>  #include <linux/anon_inodes.h>
>> +#include <linux/set_memory.h>
>>
>>  #include "kvm_mm.h"
>>
>> @@ -42,9 +43,24 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
>>       return 0;
>>  }
>>
>> -static inline void kvm_gmem_mark_prepared(struct folio *folio)
>> +static bool kvm_gmem_test_no_direct_map(struct inode *inode)
>>  {
>> -     folio_mark_uptodate(folio);
>> +     return ((unsigned long) inode->i_private) & GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
>> +}
>> +
>> +static inline int kvm_gmem_mark_prepared(struct folio *folio)
>> +{
>> +     struct inode *inode = folio_inode(folio);
>> +     int r = 0;
>> +
>> +     if (kvm_gmem_test_no_direct_map(inode))
>> +             r = set_direct_map_valid_noflush(folio_page(folio, 0), folio_nr_pages(folio),
>> +                                              false);
>> +
>> +     if (!r)
>> +             folio_mark_uptodate(folio);
>> +
>> +     return r;
>>  }
>>
>>  /*
>> @@ -82,7 +98,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>>       index = ALIGN_DOWN(index, 1 << folio_order(folio));
>>       r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
>>       if (!r)
>> -             kvm_gmem_mark_prepared(folio);
>> +             r = kvm_gmem_mark_prepared(folio);
> 
> If this fails, shouldn't we undo __kvm_gmem_prepare_folio()?
>

Yes, good point. I'm not sure if we can undo preparation (its only used
by AMD-SEV right now, for passing off the page to the CoCo context). But
not undoing it means that guest_memfd will consider the page unprepared,
and zero it again the next time it's accesses, which will cause a
machine check because the page has already been passed off to the
confidential world.

We talked about this in the guest_memfd upstream call yesterday, and
decided that in addition to this problem, we want to separate
preparedness tracking from direct map removal state tracking anyway (and
move preparedness tracking outside of guest_memfd into the arch specific
code). And if direct map state and preparedness are separate bits, then
we can accurately record the state of "preparation worked but direct map
removal failed".

>>
>>       return r;
>>  }
>> @@ -344,8 +360,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>>       }
>>
>>       if (!folio_test_uptodate(folio)) {
>> +             int err = 0;
>> +
>>               clear_highpage(folio_page(folio, 0));
>> -             kvm_gmem_mark_prepared(folio);
>> +             err = kvm_gmem_mark_prepared(folio);
>> +
>> +             if (err) {
>> +                     ret = vmf_error(err);
>> +                     goto out_folio;
>> +             }
>>       }
>>
>>       vmf->page = folio_file_page(folio, vmf->pgoff);
>> @@ -436,6 +459,16 @@ static void kvm_gmem_free_folio(struct address_space *mapping,
>>       kvm_pfn_t pfn = page_to_pfn(page);
>>       int order = folio_order(folio);
>>
>> +     /*
>> +      * Direct map restoration cannot fail, as the only error condition
>> +      * for direct map manipulation is failure to allocate page tables
>> +      * when splitting huge pages, but this split would have already
>> +      * happened in set_direct_map_invalid_noflush() in kvm_gmem_mark_prepared().
>> +      * Thus set_direct_map_valid_noflush() here only updates prot bits.
>> +      */
>> +     if (kvm_gmem_test_no_direct_map(mapping->host))
>> +             set_direct_map_valid_noflush(page, folio_nr_pages(folio), true);
>> +
>>       kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
>>  }
>>
>> @@ -500,6 +533,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>>       /* Unmovable mappings are supposed to be marked unevictable as well. */
>>       WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>
>> +     if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
>> +             mapping_set_no_direct_map(inode->i_mapping);
>> +
>>       kvm_get_kvm(kvm);
>>       gmem->kvm = kvm;
>>       xa_init(&gmem->bindings);
>> @@ -524,6 +560,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>>       if (kvm_arch_supports_gmem_mmap(kvm))
>>               valid_flags |= GUEST_MEMFD_FLAG_MMAP;
>>
>> +     if (kvm_arch_gmem_supports_no_direct_map())
>> +             valid_flags |= GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
>> +
>>       if (flags & ~valid_flags)
>>               return -EINVAL;
>>
>> @@ -768,7 +807,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>>               p = src ? src + i * PAGE_SIZE : NULL;
>>               ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
>>               if (!ret)
>> -                     kvm_gmem_mark_prepared(folio);
>> +                     ret = kvm_gmem_mark_prepared(folio);
>>
>>  put_folio_and_exit:
>>               folio_put(folio);
...

>
> Sincerely yours,
> Mike.
Best, 
Patrick


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ