linux-kernel - Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7555a235-76be-abf5-075a-80dbe6f1ea8e@amd.com>
Date:   Wed, 22 Feb 2023 13:07:31 +1100
From:   Alexey Kardashevskiy <aik@....com>
To:     Sean Christopherson <seanjc@...gle.com>,
        Chao Peng <chao.p.peng@...ux.intel.com>
Cc:     kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
        linux-arch@...r.kernel.org, linux-api@...r.kernel.org,
        linux-doc@...r.kernel.org, qemu-devel@...gnu.org,
        Paolo Bonzini <pbonzini@...hat.com>,
        Jonathan Corbet <corbet@....net>,
        Vitaly Kuznetsov <vkuznets@...hat.com>,
        Wanpeng Li <wanpengli@...cent.com>,
        Jim Mattson <jmattson@...gle.com>,
        Joerg Roedel <joro@...tes.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        Arnd Bergmann <arnd@...db.de>,
        Naoya Horiguchi <naoya.horiguchi@....com>,
        Miaohe Lin <linmiaohe@...wei.com>, x86@...nel.org,
        "H . Peter Anvin" <hpa@...or.com>, Hugh Dickins <hughd@...gle.com>,
        Jeff Layton <jlayton@...nel.org>,
        "J . Bruce Fields" <bfields@...ldses.org>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Shuah Khan <shuah@...nel.org>, Mike Rapoport <rppt@...nel.org>,
        Steven Price <steven.price@....com>,
        "Maciej S . Szmigiero" <mail@...iej.szmigiero.name>,
        Vlastimil Babka <vbabka@...e.cz>,
        Vishal Annapurve <vannapurve@...gle.com>,
        Yu Zhang <yu.c.zhang@...ux.intel.com>,
        "Kirill A . Shutemov" <kirill.shutemov@...ux.intel.com>,
        luto@...nel.org, jun.nakajima@...el.com, dave.hansen@...el.com,
        ak@...ux.intel.com, david@...hat.com, aarcange@...hat.com,
        ddutile@...hat.com, dhildenb@...hat.com,
        Quentin Perret <qperret@...gle.com>, tabba@...gle.com,
        Michael Roth <michael.roth@....com>, mhocko@...e.com,
        wei.w.wang@...el.com
Subject: Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to
 create restricted user memory

On 14/1/23 08:54, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
>> The system call is currently wired up for x86 arch.
> 
> Building on other architectures (except for arm64 for some reason) yields:
> 
>    CALL    /.../scripts/checksyscalls.sh
>    <stdin>:1565:2: warning: #warning syscall memfd_restricted not implemented [-Wcpp]
> 
> Do we care?  It's the only such warning, which makes me think we either need to
> wire this up for all architectures, or explicitly document that it's unsupported.
> 
>> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
>> Signed-off-by: Chao Peng <chao.p.peng@...ux.intel.com>
>> ---
> 
> ...
> 
>> diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
>> new file mode 100644
>> index 000000000000..c2700c5daa43
>> --- /dev/null
>> +++ b/include/linux/restrictedmem.h
>> @@ -0,0 +1,71 @@
>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>> +#ifndef _LINUX_RESTRICTEDMEM_H
> 
> Missing
> 
>   #define _LINUX_RESTRICTEDMEM_H
> 
> which causes fireworks if restrictedmem.h is included more than once.
> 
>> +#include <linux/file.h>
>> +#include <linux/magic.h>
>> +#include <linux/pfn_t.h>
> 
> ...
> 
>> +static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
>> +					 struct page **pagep, int *order)
>> +{
>> +	return -1;
> 
> This should be a proper -errno, though in the current incarnation of things it's
> a moot point because no stub is needed.  KVM can (and should) easily provide its
> own stub for this one.
> 
>> +}
>> +
>> +static inline bool file_is_restrictedmem(struct file *file)
>> +{
>> +	return false;
>> +}
>> +
>> +static inline void restrictedmem_error_page(struct page *page,
>> +					    struct address_space *mapping)
>> +{
>> +}
>> +
>> +#endif /* CONFIG_RESTRICTEDMEM */
>> +
>> +#endif /* _LINUX_RESTRICTEDMEM_H */
> 
> ...
> 
>> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
>> new file mode 100644
>> index 000000000000..56953c204e5c
>> --- /dev/null
>> +++ b/mm/restrictedmem.c
>> @@ -0,0 +1,318 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +#include "linux/sbitmap.h"
>> +#include <linux/pagemap.h>
>> +#include <linux/pseudo_fs.h>
>> +#include <linux/shmem_fs.h>
>> +#include <linux/syscalls.h>
>> +#include <uapi/linux/falloc.h>
>> +#include <uapi/linux/magic.h>
>> +#include <linux/restrictedmem.h>
>> +
>> +struct restrictedmem_data {
> 
> Any objection to simply calling this "restrictedmem"?  And then using either "rm"
> or "rmem" for local variable names?  I kept reading "data" as the underlying data
> being written to the page, as opposed to the metadata describing the restrictedmem
> instance.
> 
>> +	struct mutex lock;
>> +	struct file *memfd;
>> +	struct list_head notifiers;
>> +};
>> +
>> +static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
>> +					   pgoff_t start, pgoff_t end)
>> +{
>> +	struct restrictedmem_notifier *notifier;
>> +
>> +	mutex_lock(&data->lock);
> 
> This can be a r/w semaphore instead of a mutex, that way punching holes at multiple
> points in the file can at least run the notifiers in parallel.  The actual allocation
> by shmem will still be serialized, but I think it's worth the simple optimization
> since zapping and flushing in KVM may be somewhat slow.
> 
>> +	list_for_each_entry(notifier, &data->notifiers, list) {
>> +		notifier->ops->invalidate_start(notifier, start, end);
> 
> Two major design issues that we overlooked long ago:
> 
>    1. Blindly invoking notifiers will not scale.  E.g. if userspace configures a
>       VM with a large number of convertible memslots that are all backed by a
>       single large restrictedmem instance, then converting a single page will
>       result in a linear walk through all memslots.  I don't expect anyone to
>       actually do something silly like that, but I also never expected there to be
>       a legitimate usecase for thousands of memslots.
> 
>    2. This approach fails to provide the ability for KVM to ensure a guest has
>       exclusive access to a page.  As discussed in the past, the kernel can rely
>       on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
>       only for SNP and TDX VMs.  For VMs where userspace is trusted to some extent,
>       e.g. SEV, there is value in ensuring a 1:1 association.
> 
>       And probably more importantly, relying on hardware for SNP and TDX yields a
>       poor ABI and complicates KVM's internals.  If the kernel doesn't guarantee a
>       page is exclusive to a guest, i.e. if userspace can hand out the same page
>       from a restrictedmem instance to multiple VMs, then failure will occur only
>       when KVM tries to assign the page to the second VM.  That will happen deep
>       in KVM, which means KVM needs to gracefully handle such errors, and it means
>       that KVM's ABI effectively allows plumbing garbage into its memslots.
> 
> Rather than use a simple list of notifiers, this appears to be yet another
> opportunity to use an xarray.  Supporting sharing of restrictedmem will be
> non-trivial, but IMO we should punt that to the future since it's still unclear
> exactly how sharing will work.
> 
> An xarray will solve #1 by notifying only the consumers (memslots) that are bound
> to the affected range.
> 
> And for #2, it's relatively straightforward (knock wood) to detect existing
> entries, i.e. if the user wants exclusive access to memory, then the bind operation
> can be rejected if there's an existing entry.
> 
> VERY lightly tested code snippet at the bottom (will provide link to fully worked
> code in cover letter).
> 
> 
>> +static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
>> +				     loff_t offset, loff_t len)
>> +{
>> +	int ret;
>> +	pgoff_t start, end;
>> +	struct file *memfd = data->memfd;
>> +
>> +	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
>> +		return -EINVAL;
>> +
>> +	start = offset >> PAGE_SHIFT;
>> +	end = (offset + len) >> PAGE_SHIFT;
>> +
>> +	restrictedmem_invalidate_start(data, start, end);
>> +	ret = memfd->f_op->fallocate(memfd, mode, offset, len);
>> +	restrictedmem_invalidate_end(data, start, end);
> 
> The lock needs to be held for the entire duration of the hole punch, i.e. needs to
> be taken before invalidate_start() and released after invalidate_end().  If a user
> (un)binds/(un)registers after invalidate_start(), it will see an unpaired notification,
> e.g. could leave KVM with incorrect notifier counts.
> 
>> +
>> +	return ret;
>> +}
> 
> What I ended up with for an xarray-based implementation.  I'm very flexible on
> names and whatnot, these are just what made sense to me.
> 
> static long restrictedmem_punch_hole(struct restrictedmem *rm, int mode,
> 				     loff_t offset, loff_t len)
> {
> 	struct restrictedmem_notifier *notifier;
> 	struct file *memfd = rm->memfd;
> 	unsigned long index;
> 	pgoff_t start, end;
> 	int ret;
> 
> 	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> 		return -EINVAL;
> 
> 	start = offset >> PAGE_SHIFT;
> 	end = (offset + len) >> PAGE_SHIFT;
> 
> 	/*
> 	 * Bindings must be stable across invalidation to ensure the start+end
> 	 * are balanced.
> 	 */
> 	down_read(&rm->lock);
> 
> 	xa_for_each_range(&rm->bindings, index, notifier, start, end)
> 		notifier->ops->invalidate_start(notifier, start, end);
> 
> 	ret = memfd->f_op->fallocate(memfd, mode, offset, len);
> 
> 	xa_for_each_range(&rm->bindings, index, notifier, start, end)
> 		notifier->ops->invalidate_end(notifier, start, end);
> 
> 	up_read(&rm->lock);
> 
> 	return ret;
> }
> 
> int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
> 		       struct restrictedmem_notifier *notifier, bool exclusive)
> {
> 	struct restrictedmem *rm = file->f_mapping->private_data;
> 	int ret = -EINVAL;
> 
> 	down_write(&rm->lock);
> 
> 	/* Non-exclusive mappings are not yet implemented. */
> 	if (!exclusive)
> 		goto out_unlock;
> 
> 	if (!xa_empty(&rm->bindings)) {
> 		if (exclusive != rm->exclusive)
> 			goto out_unlock;
> 
> 		if (exclusive && xa_find(&rm->bindings, &start, end, XA_PRESENT))
> 			goto out_unlock;
> 	}
> 
> 	xa_store_range(&rm->bindings, start, end, notifier, GFP_KERNEL);


|| ld: mm/restrictedmem.o: in function `restrictedmem_bind':
mm/restrictedmem.c|295| undefined reference to `xa_store_range'


This is missing:
===
diff --git a/mm/Kconfig b/mm/Kconfig
index f952d0172080..03aca542c0da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1087,6 +1087,7 @@ config SECRETMEM
  config RESTRICTEDMEM
         bool
         depends on TMPFS
+       select XARRAY_MULTI
===

Thanks,



> 	rm->exclusive = exclusive;
> 	ret = 0;
> out_unlock:
> 	up_write(&rm->lock);
> 	return ret;
> }
> EXPORT_SYMBOL_GPL(restrictedmem_bind);
> 
> void restrictedmem_unbind(struct file *file, pgoff_t start, pgoff_t end,
> 			  struct restrictedmem_notifier *notifier)
> {
> 	struct restrictedmem *rm = file->f_mapping->private_data;
> 
> 	down_write(&rm->lock);
> 	xa_store_range(&rm->bindings, start, end, NULL, GFP_KERNEL);
> 	synchronize_rcu();
> 	up_write(&rm->lock);
> }
> EXPORT_SYMBOL_GPL(restrictedmem_unbind);

-- 
Alexey