lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f0232380-4171-f4d3-f1a6-07993e551b46@redhat.com>
Date:   Mon, 3 Apr 2023 10:21:48 +0200
From:   David Hildenbrand <david@...hat.com>
To:     Ackerley Tng <ackerleytng@...gle.com>, kvm@...r.kernel.org,
        linux-api@...r.kernel.org, linux-arch@...r.kernel.org,
        linux-doc@...r.kernel.org, linux-fsdevel@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-mm@...ck.org,
        qemu-devel@...gnu.org
Cc:     aarcange@...hat.com, ak@...ux.intel.com, akpm@...ux-foundation.org,
        arnd@...db.de, bfields@...ldses.org, bp@...en8.de,
        chao.p.peng@...ux.intel.com, corbet@....net, dave.hansen@...el.com,
        ddutile@...hat.com, dhildenb@...hat.com, hpa@...or.com,
        hughd@...gle.com, jlayton@...nel.org, jmattson@...gle.com,
        joro@...tes.org, jun.nakajima@...el.com,
        kirill.shutemov@...ux.intel.com, linmiaohe@...wei.com,
        luto@...nel.org, mail@...iej.szmigiero.name, mhocko@...e.com,
        michael.roth@....com, mingo@...hat.com, naoya.horiguchi@....com,
        pbonzini@...hat.com, qperret@...gle.com, rppt@...nel.org,
        seanjc@...gle.com, shuah@...nel.org, steven.price@....com,
        tabba@...gle.com, tglx@...utronix.de, vannapurve@...gle.com,
        vbabka@...e.cz, vkuznets@...hat.com, wanpengli@...cent.com,
        wei.w.wang@...el.com, x86@...nel.org, yu.c.zhang@...ux.intel.com
Subject: Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify
 mount for memfd_restricted

On 01.04.23 01:50, Ackerley Tng wrote:
> By default, the backing shmem file for a restrictedmem fd is created
> on shmem's kernel space mount.
> 
> With this patch, an optional tmpfs mount can be specified via an fd,
> which will be used as the mountpoint for backing the shmem file
> associated with a restrictedmem fd.
> 
> This will help restrictedmem fds inherit the properties of the
> provided tmpfs mounts, for example, hugepage allocation hints, NUMA
> binding hints, etc.
> 
> Permissions for the fd passed to memfd_restricted() is modeled after
> the openat() syscall, since both of these allow creation of a file
> upon a mount/directory.
> 
> Permission to reference the mount the fd represents is checked upon fd
> creation by other syscalls (e.g. fsmount(), open(), or open_tree(),
> etc) and any process that can present memfd_restricted() with a valid
> fd is expected to have obtained permission to use the mount
> represented by the fd. This behavior is intended to parallel that of
> the openat() syscall.
> 
> memfd_restricted() will check that the tmpfs superblock is
> writable, and that the mount is also writable, before attempting to
> create a restrictedmem file on the mount.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
> ---
>   include/linux/syscalls.h           |  2 +-
>   include/uapi/linux/restrictedmem.h |  8 ++++
>   mm/restrictedmem.c                 | 74 +++++++++++++++++++++++++++---
>   3 files changed, 77 insertions(+), 7 deletions(-)
>   create mode 100644 include/uapi/linux/restrictedmem.h
> 
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index f9e9e0c820c5..a23c4c385cd3 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
>   asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
>   					    unsigned long home_node,
>   					    unsigned long flags);
> -asmlinkage long sys_memfd_restricted(unsigned int flags);
> +asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd);
> 
>   /*
>    * Architecture-specific system calls
> diff --git a/include/uapi/linux/restrictedmem.h b/include/uapi/linux/restrictedmem.h
> new file mode 100644
> index 000000000000..22d6f2285f6d
> --- /dev/null
> +++ b/include/uapi/linux/restrictedmem.h
> @@ -0,0 +1,8 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
> +#define _UAPI_LINUX_RESTRICTEDMEM_H
> +
> +/* flags for memfd_restricted */
> +#define RMFD_USERMNT		0x0001U

I wonder if we can come up with a more expressive prefix than RMFD. 
Sounds more like "rm fd" ;) Maybe it should better match the 
"memfd_restricted" syscall name, like "MEMFD_RSTD_USERMNT".


> +
> +#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> index c5d869d8c2d8..f7b62364a31a 100644
> --- a/mm/restrictedmem.c
> +++ b/mm/restrictedmem.c
> @@ -1,11 +1,12 @@
>   // SPDX-License-Identifier: GPL-2.0
> -#include "linux/sbitmap.h"

Looks like an unrelated change?

> +#include <linux/namei.h>
>   #include <linux/pagemap.h>
>   #include <linux/pseudo_fs.h>
>   #include <linux/shmem_fs.h>
>   #include <linux/syscalls.h>
>   #include <uapi/linux/falloc.h>
>   #include <uapi/linux/magic.h>
> +#include <uapi/linux/restrictedmem.h>
>   #include <linux/restrictedmem.h>
> 
>   struct restrictedmem {
> @@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct file *memfd)
>   	return file;
>   }
> 
> -SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> +static int restrictedmem_create(struct vfsmount *mount)
>   {
>   	struct file *file, *restricted_file;
>   	int fd, err;
> 
> -	if (flags)
> -		return -EINVAL;
> -
>   	fd = get_unused_fd_flags(0);
>   	if (fd < 0)
>   		return fd;
> 
> -	file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> +	if (mount)
> +		file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE);
> +	else
> +		file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> +
>   	if (IS_ERR(file)) {
>   		err = PTR_ERR(file);
>   		goto err_fd;
> @@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
>   	return err;
>   }
> 
> +static bool is_shmem_mount(struct vfsmount *mnt)
> +{
> +	return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;
> +}
> +
> +static bool is_mount_root(struct file *file)
> +{
> +	return file->f_path.dentry == file->f_path.mnt->mnt_root;
> +}

I'd inline at least that function, pretty self-explaining.

> +
> +static int restrictedmem_create_on_user_mount(int mount_fd)
> +{
> +	int ret;
> +	struct fd f;
> +	struct vfsmount *mnt;
> +
> +	f = fdget_raw(mount_fd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	ret = -EINVAL;
> +	if (!is_mount_root(f.file))
> +		goto out;
> +
> +	mnt = f.file->f_path.mnt;
> +	if (!is_shmem_mount(mnt))
> +		goto out;
> +
> +	ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);
> +	if (ret)
> +		goto out;
> +
> +	ret = mnt_want_write(mnt);
> +	if (unlikely(ret))
> +		goto out;
> +
> +	ret = restrictedmem_create(mnt);
> +
> +	mnt_drop_write(mnt);
> +out:
> +	fdput(f);
> +
> +	return ret;
> +}
> +
> +SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
> +{
> +	if (flags & ~RMFD_USERMNT)
> +		return -EINVAL;
> +
> +	if (flags == RMFD_USERMNT) {
> +		if (mount_fd < 0)
> +			return -EINVAL;
> +
> +		return restrictedmem_create_on_user_mount(mount_fd);
> +	} else {
> +		return restrictedmem_create(NULL);
> +	}


You can drop the else case:

if (flags == RMFD_USERMNT) {
	...
	return restrictedmem_create_on_user_mount(mount_fd);
}
return restrictedmem_create(NULL);


I do wonder if you want to properly check for a flag instead of 
comparing values. Results in a more natural way to deal with flags:

if (flags & RMFD_USERMNT) {

}

> +}
> +
>   int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
>   		       struct restrictedmem_notifier *notifier, bool exclusive)
>   {

The "memfd_restricted" vs. "restrictedmem" terminology is a bit 
unfortunate, but not your fault here.


I'm not a FS person, but it does look good to me.

-- 
Thanks,

David / dhildenb

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ