Message-ID: <3349f838-2c73-4ef0-aa30-a21e41fb39e5@amd.com>
Date: Thu, 21 Nov 2024 11:40:59 -0600
From: Mike Day <michael.day@....com>
To: Elliot Berman <quic_eberman@...cinc.com>,
Paolo Bonzini <pbonzini@...hat.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Sean Christopherson <seanjc@...gle.com>, Fuad Tabba <tabba@...gle.com>,
Ackerley Tng <ackerleytng@...gle.com>, Mike Rapoport <rppt@...nel.org>,
David Hildenbrand <david@...hat.com>, "H. Peter Anvin" <hpa@...or.com>,
"Matthew Wilcox (Oracle)" <willy@...radead.org>,
Jonathan Corbet <corbet@....net>, Trond Myklebust <trondmy@...nel.org>,
Anna Schumaker <anna@...nel.org>, Mike Marshall <hubcap@...ibond.com>,
Martin Brandenburg <martin@...ibond.com>,
Alexander Viro <viro@...iv.linux.org.uk>,
Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>
Cc: James Gowans <jgowans@...zon.com>, linux-fsdevel@...r.kernel.org,
kvm@...r.kernel.org, linux-coco@...ts.linux.dev,
linux-arm-msm@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, linux-doc@...r.kernel.org, linux-nfs@...r.kernel.org,
devel@...ts.orangefs.org, linux-arm-kernel@...ts.infradead.org
Subject: Re: [PATCH v4 2/2] mm: guestmem: Convert address_space operations to
guestmem library
On 11/21/24 10:43, Elliot Berman wrote:
> On Wed, Nov 20, 2024 at 10:12:08AM -0800, Elliot Berman wrote:
>> diff --git a/mm/guestmem.c b/mm/guestmem.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..19dd7e5d498f07577ec5cec5b52055f7435980f4
>> --- /dev/null
>> +++ b/mm/guestmem.c
>> @@ -0,0 +1,196 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * guestmem library
>> + *
>> + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
>> + */
>> +
>> +#include <linux/fs.h>
>> +#include <linux/guestmem.h>
>> +#include <linux/mm.h>
>> +#include <linux/pagemap.h>
>> +
>> +struct guestmem {
>> + const struct guestmem_ops *ops;
>> +};
>> +
>> +static inline struct guestmem *folio_to_guestmem(struct folio *folio)
>> +{
>> + struct address_space *mapping = folio->mapping;
>> +
>> + return mapping->i_private_data;
>> +}
>> +
>> +static inline bool __guestmem_release_folio(struct address_space *mapping,
>> + struct folio *folio)
>> +{
>> + struct guestmem *gmem = mapping->i_private_data;
>> + struct list_head *entry;
>> +
>> + if (gmem->ops->release_folio) {
>> + list_for_each(entry, &mapping->i_private_list) {
>> + if (!gmem->ops->release_folio(entry, folio))
>> + return false;
>> + }
>> + }
>> +
>> + return true;
>> +}
>> +
>> +static inline int
>> +__guestmem_invalidate_begin(struct address_space *const mapping, pgoff_t start,
>> + pgoff_t end)
>> +{
>> + struct guestmem *gmem = mapping->i_private_data;
>> + struct list_head *entry;
>> + int ret = 0;
>> +
>> + list_for_each(entry, &mapping->i_private_list) {
>> + ret = gmem->ops->invalidate_begin(entry, start, end);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static inline void
>> +__guestmem_invalidate_end(struct address_space *const mapping, pgoff_t start,
>> + pgoff_t end)
>> +{
>> + struct guestmem *gmem = mapping->i_private_data;
>> + struct list_head *entry;
>> +
>> + if (gmem->ops->invalidate_end) {
>> + list_for_each(entry, &mapping->i_private_list)
>> + gmem->ops->invalidate_end(entry, start, end);
>> + }
>> +}
>> +
>> +static void guestmem_free_folio(struct address_space *mapping,
>> + struct folio *folio)
>> +{
>> + WARN_ON_ONCE(!__guestmem_release_folio(mapping, folio));
>> +}
>> +
>> +static int guestmem_error_folio(struct address_space *mapping,
>> + struct folio *folio)
>> +{
>> + pgoff_t start, end;
>> + int ret;
>> +
>> + filemap_invalidate_lock_shared(mapping);
>> +
>> + start = folio->index;
>> + end = start + folio_nr_pages(folio);
>> +
>> + ret = __guestmem_invalidate_begin(mapping, start, end);
>> + if (ret)
>> + goto out;
>> +
>> + /*
>> + * Do not truncate the range, what action is taken in response to the
>> + * error is userspace's decision (assuming the architecture supports
>> + * gracefully handling memory errors). If/when the guest attempts to
>> + * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
>> + * at which point KVM can either terminate the VM or propagate the
>> + * error to userspace.
>> + */
>> +
>> + __guestmem_invalidate_end(mapping, start, end);
>> +
>> +out:
>> + filemap_invalidate_unlock_shared(mapping);
>> + return ret ? MF_DELAYED : MF_FAILED;
>> +}
>> +
>> +static int guestmem_migrate_folio(struct address_space *mapping,
>> + struct folio *dst, struct folio *src,
>> + enum migrate_mode mode)
>> +{
>> + WARN_ON_ONCE(1);
>> + return -EINVAL;
>> +}
>> +
>> +static const struct address_space_operations guestmem_aops = {
>> + .dirty_folio = noop_dirty_folio,
>> + .free_folio = guestmem_free_folio,
>> + .error_remove_folio = guestmem_error_folio,
>> + .migrate_folio = guestmem_migrate_folio,
>> +};
>> +
>> +int guestmem_attach_mapping(struct address_space *mapping,
>> + const struct guestmem_ops *const ops,
>> + struct list_head *data)
>> +{
>> + struct guestmem *gmem;
>> +
>> + if (mapping->a_ops == &guestmem_aops) {
>> + gmem = mapping->i_private_data;
>> + if (gmem->ops != ops)
>> + return -EINVAL;
>> +
>> + goto add;
>> + }
>> +
>> + gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
>> + if (!gmem)
>> + return -ENOMEM;
>> +
>> + gmem->ops = ops;
>> +
>> + mapping->a_ops = &guestmem_aops;
>> + mapping->i_private_data = gmem;
>> +
>> + mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
>> + mapping_set_inaccessible(mapping);
>> + /* Unmovable mappings are supposed to be marked unevictable as well. */
>> + WARN_ON_ONCE(!mapping_unevictable(mapping));
>> +
>> +add:
>> + list_add(data, &mapping->i_private_list);
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(guestmem_attach_mapping);
>> +
>> +void guestmem_detach_mapping(struct address_space *mapping,
>> + struct list_head *data)
>> +{
>> + list_del(data);
>> +
>> + if (list_empty(&mapping->i_private_list)) {
>> + kfree(mapping->i_private_data);
>
> Mike was helping me test this out for SEV-SNP. They helped find a bug
> here. Right now, when the file closes, KVM calls
> guestmem_detach_mapping(), which uninstalls the ops. At that point,
> not all of the folios have necessarily been removed from the filemap
> yet, so our free_folio() callback is never invoked for them. This
> means that we skip updating the RMP entries back to shared/KVM-owned.
>
> There are a few approaches I could take:
>
> 1. Create a guestmem superblock so I can register a guestmem-specific
>    destroy_inode() to do the kfree() above. This requires a lot of
>    boilerplate code, and I don't think it's the preferred approach.
> 2. Update how KVM tracks the memory so that it is back in the "shared"
>    state when the file closes. This requires significant rework of the
>    page-state tracking compared to current guest_memfd. That rework
>    might be useful for the shared/private state machine.
> 3. Call truncate_inode_pages(mapping, 0) to force the pages to be freed
>    here. It might be possible that a page is allocated after this
>    point. For that to be a problem, KVM would need to update the
>    RMP entry as guest-owned, and I don't believe that's possible after
>    the last guestmem_detach_mapping().
>
> My preference is to go with #3 as it is the easiest thing to do.
#3 is my preference as well. The semantics are that the guest is "closing" the gmem
object, which means all the memory is being released from the guest.
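For what it's worth, here is a rough, untested sketch of what #3 might look
like in guestmem_detach_mapping(). The list_is_singular() check and the
ordering are my guesses, not something from the posted patch: since
__guestmem_release_folio() walks i_private_list, it looks like the truncate
has to happen before the last entry is removed from the list, otherwise
free_folio() runs against an empty list and the release_folio() callback
(the RMP restore on SEV-SNP) is still skipped.

void guestmem_detach_mapping(struct address_space *mapping,
                             struct list_head *data)
{
        /*
         * If this is the last attached entry, drop any folios still in
         * the filemap now, while the guestmem a_ops and this entry are
         * both still in place, so free_folio() -> ->release_folio()
         * runs for each folio (on SEV-SNP this is what flips the RMP
         * entries back to hypervisor-owned).
         */
        if (list_is_singular(&mapping->i_private_list))
                truncate_inode_pages(mapping, 0);

        list_del(data);

        if (list_empty(&mapping->i_private_list)) {
                kfree(mapping->i_private_data);
                mapping->i_private_data = NULL;
                mapping->a_ops = &empty_aops;
        }
}

Whatever the exact placement ends up being, the key point is just that the
pages get freed while the guestmem ops can still see them.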
Mike
>
>> + mapping->i_private_data = NULL;
>> + mapping->a_ops = &empty_aops;
>> + }
>> +}
>> +EXPORT_SYMBOL_GPL(guestmem_detach_mapping);
>> +
>> +struct folio *guestmem_grab_folio(struct address_space *mapping, pgoff_t index)
>> +{
>> + /* TODO: Support huge pages. */
>> + return filemap_grab_folio(mapping, index);
>> +}
>> +EXPORT_SYMBOL_GPL(guestmem_grab_folio);
>> +
>> +int guestmem_punch_hole(struct address_space *mapping, loff_t offset,
>> + loff_t len)
>> +{
>> + pgoff_t start = offset >> PAGE_SHIFT;
>> + pgoff_t end = (offset + len) >> PAGE_SHIFT;
>> + int ret;
>> +
>> + filemap_invalidate_lock(mapping);
>> + ret = __guestmem_invalidate_begin(mapping, start, end);
>> + if (ret)
>> + goto out;
>> +
>> + truncate_inode_pages_range(mapping, offset, offset + len - 1);
>> +
>> + __guestmem_invalidate_end(mapping, start, end);
>> +
>> +out:
>> + filemap_invalidate_unlock(mapping);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(guestmem_punch_hole);
>> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
>> index fd6a3010afa833e077623065b80bdbb5b1012250..1339098795d2e859b2ee0ef419b29045aedc8487 100644
>> --- a/virt/kvm/Kconfig
>> +++ b/virt/kvm/Kconfig
>> @@ -106,6 +106,7 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES
>>
>> config KVM_PRIVATE_MEM
>> select XARRAY_MULTI
>> + select GUESTMEM
>> bool
>>
>> config KVM_GENERIC_PRIVATE_MEM
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 24dcbad0cb76e353509cf4718837a1999f093414..edf57d5662cb8634bbd9ca3118b293c4f7ca229a 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -1,6 +1,7 @@
>> // SPDX-License-Identifier: GPL-2.0
>> #include <linux/backing-dev.h>
>> #include <linux/falloc.h>
>> +#include <linux/guestmem.h>
>> #include <linux/kvm_host.h>
>> #include <linux/pagemap.h>
>> #include <linux/anon_inodes.h>
>> @@ -98,8 +99,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>> */
>> static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>> {
>> - /* TODO: Support huge pages. */
>> - return filemap_grab_folio(inode->i_mapping, index);
>> + return guestmem_grab_folio(inode->i_mapping, index);
>> }
>>
>> static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
>> @@ -151,28 +151,7 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
>>
>> static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>> {
>> - struct list_head *gmem_list = &inode->i_mapping->i_private_list;
>> - pgoff_t start = offset >> PAGE_SHIFT;
>> - pgoff_t end = (offset + len) >> PAGE_SHIFT;
>> - struct kvm_gmem *gmem;
>> -
>> - /*
>> - * Bindings must be stable across invalidation to ensure the start+end
>> - * are balanced.
>> - */
>> - filemap_invalidate_lock(inode->i_mapping);
>> -
>> - list_for_each_entry(gmem, gmem_list, entry)
>> - kvm_gmem_invalidate_begin(gmem, start, end);
>> -
>> - truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
>> -
>> - list_for_each_entry(gmem, gmem_list, entry)
>> - kvm_gmem_invalidate_end(gmem, start, end);
>> -
>> - filemap_invalidate_unlock(inode->i_mapping);
>> -
>> - return 0;
>> + return guestmem_punch_hole(inode->i_mapping, offset, len);
>> }
>>
>> static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>> @@ -277,7 +256,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
>> kvm_gmem_invalidate_begin(gmem, 0, -1ul);
>> kvm_gmem_invalidate_end(gmem, 0, -1ul);
>>
>> - list_del(&gmem->entry);
>> + guestmem_detach_mapping(inode->i_mapping, &gmem->entry);
>>
>> filemap_invalidate_unlock(inode->i_mapping);
>>
>> @@ -318,63 +297,42 @@ void kvm_gmem_init(struct module *module)
>> kvm_gmem_fops.owner = module;
>> }
>>
>> -static int kvm_gmem_migrate_folio(struct address_space *mapping,
>> - struct folio *dst, struct folio *src,
>> - enum migrate_mode mode)
>> +static int kvm_guestmem_invalidate_begin(struct list_head *entry, pgoff_t start,
>> + pgoff_t end)
>> {
>> - WARN_ON_ONCE(1);
>> - return -EINVAL;
>> -}
>> + struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry);
>>
>> -static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
>> -{
>> - struct list_head *gmem_list = &mapping->i_private_list;
>> - struct kvm_gmem *gmem;
>> - pgoff_t start, end;
>> -
>> - filemap_invalidate_lock_shared(mapping);
>> -
>> - start = folio->index;
>> - end = start + folio_nr_pages(folio);
>> -
>> - list_for_each_entry(gmem, gmem_list, entry)
>> - kvm_gmem_invalidate_begin(gmem, start, end);
>> -
>> - /*
>> - * Do not truncate the range, what action is taken in response to the
>> - * error is userspace's decision (assuming the architecture supports
>> - * gracefully handling memory errors). If/when the guest attempts to
>> - * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
>> - * at which point KVM can either terminate the VM or propagate the
>> - * error to userspace.
>> - */
>> + kvm_gmem_invalidate_begin(gmem, start, end);
>>
>> - list_for_each_entry(gmem, gmem_list, entry)
>> - kvm_gmem_invalidate_end(gmem, start, end);
>> + return 0;
>> +}
>>
>> - filemap_invalidate_unlock_shared(mapping);
>> +static void kvm_guestmem_invalidate_end(struct list_head *entry, pgoff_t start,
>> + pgoff_t end)
>> +{
>> + struct kvm_gmem *gmem = container_of(entry, struct kvm_gmem, entry);
>>
>> - return MF_DELAYED;
>> + kvm_gmem_invalidate_end(gmem, start, end);
>> }
>>
>> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>> -static void kvm_gmem_free_folio(struct address_space *mapping,
>> - struct folio *folio)
>> +static bool kvm_gmem_release_folio(struct list_head *entry, struct folio *folio)
>> {
>> struct page *page = folio_page(folio, 0);
>> kvm_pfn_t pfn = page_to_pfn(page);
>> int order = folio_order(folio);
>>
>> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
>> +
>> + return true;
>> }
>> #endif
>>
>> -static const struct address_space_operations kvm_gmem_aops = {
>> - .dirty_folio = noop_dirty_folio,
>> - .migrate_folio = kvm_gmem_migrate_folio,
>> - .error_remove_folio = kvm_gmem_error_folio,
>> +static const struct guestmem_ops kvm_guestmem_ops = {
>> + .invalidate_begin = kvm_guestmem_invalidate_begin,
>> + .invalidate_end = kvm_guestmem_invalidate_end,
>> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>> - .free_folio = kvm_gmem_free_folio,
>> + .release_folio = kvm_gmem_release_folio,
>> #endif
>> };
>>
>> @@ -430,22 +388,22 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>>
>> inode->i_private = (void *)(unsigned long)flags;
>> inode->i_op = &kvm_gmem_iops;
>> - inode->i_mapping->a_ops = &kvm_gmem_aops;
>> inode->i_mode |= S_IFREG;
>> inode->i_size = size;
>> - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
>> - mapping_set_inaccessible(inode->i_mapping);
>> - /* Unmovable mappings are supposed to be marked unevictable as well. */
>> - WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>> + err = guestmem_attach_mapping(inode->i_mapping, &kvm_guestmem_ops,
>> + &gmem->entry);
>> + if (err)
>> + goto err_putfile;
>>
>> kvm_get_kvm(kvm);
>> gmem->kvm = kvm;
>> xa_init(&gmem->bindings);
>> - list_add(&gmem->entry, &inode->i_mapping->i_private_list);
>>
>> fd_install(fd, file);
>> return fd;
>>
>> +err_putfile:
>> + fput(file);
>> err_gmem:
>> kfree(gmem);
>> err_fd:
>>
>> --
>> 2.34.1
>>