[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7718ec5b-0a9e-ffa6-16f2-bc0b6afbd9ab@gmail.com>
Date: Fri, 28 May 2021 10:07:02 -0700
From: "Lin, Ming" <minggr@...il.com>
To: Linus Torvalds <torvalds@...ux-foundation.org>,
Simon Ser <contact@...rsion.fr>
Cc: Peter Xu <peterx@...hat.com>,
"Kirill A. Shutemov" <kirill@...temov.name>,
Matthew Wilcox <willy@...radead.org>,
Dan Williams <dan.j.williams@...el.com>,
"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
Will Deacon <will@...nel.org>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
David Herrmann <dh.herrmann@...il.com>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
Greg Kroah-Hartman <greg@...ah.com>,
"tytso@....edu" <tytso@....edu>
Subject: Re: Sealed memfd & no-fault mmap
On 5/5/2021 11:42 AM, Linus Torvalds wrote:
> On Wed, May 5, 2021 at 3:21 AM Simon Ser <contact@...rsion.fr> wrote:
>>>
>>> Is there some very specific and targeted pattern for that "shared
>>> mapping" case? For example, if it's always a shared anonymous mapping
>>> with no filesystem backing, then that would possibly be a simpler case
>>> than the "random arbitrary shared file descriptor".
>>
>> Yes. I don't know of any Wayland client using buffers with real
>> filesystem backing. I think the main cases are:
>>
>> - shm_open(3) immediately followed by shm_unlink(3). On Linux, this is
>> implemented with /dev/shm which is a tmpfs.
>> - Abusing /tmp or /run's tmpfs by creating a file there and unlinking
>> it immediately afterwards. Kind of similar to the first case.
>> - memfd_create(2) on Linux.
>>
>> Is this enough to make it work on shared memory mappings? Is it
>> important that the mapping is anonymous?
>
> All of those should be anonymous in the sense that the backing store
> is all the kernel's notion of anonymous pages, and there is no actual
> file backing. The mappings may then be shared, of course.
>
> So that does make Peter's idea to have some inode flag for "don't
> SIGBUS on fault" be more reasonable, because there isn't some random
> actual filesystem involved, only the core VM layer.
>
> I'm not going to write the patch, though, but maybe you can convince
> somebody else to try it..
Does something like following draft patch on the right track?
1. Application set S_NOFAULT flag on shm mmap fd
#define S_NOFAULT (1 << 17)
fd = shm_open(shmpath, O_RDONLY, S_IRUSR | S_IWUSR);
ioctl(fd, FS_IOC_GETFLAGS, &flags);
flags |= S_NOFAULT;
ioctl(fd, FS_IOC_SETFLAGS, &flags)
2. Don't SIGBUS on read beyond i_size if S_NOFAULT flag set in inode.
Use zero page instead.
---
[RFC DRAFT PATCH] shm: no SIGBUS fault on out-of-band mmap read
---
include/linux/fs.h | 2 ++
mm/shmem.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 45 insertions(+), 1 deletion(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3c88fdb9b2a..a9be7cd71b94 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2202,6 +2202,7 @@ struct super_operations {
#define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
+#define S_NOFAULT (1 << 17) /* No SIGBUS fault on out-of-band mmap read */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2244,6 +2245,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode) ((inode)->i_flags & S_VERITY)
+#define IS_NOFAULT(inode) ((inode)->i_flags & S_NOFAULT)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
diff --git a/mm/shmem.c b/mm/shmem.c
index 5d46611cba8d..856d2d8d4cdf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -38,8 +38,11 @@
#include <linux/hugetlb.h>
#include <linux/frontswap.h>
#include <linux/fs_parser.h>
+#include <linux/fs.h>
+#include <linux/fileattr.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+#include <asm/pgalloc.h>
static struct vfsmount *shm_mnt;
@@ -1812,7 +1815,27 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
repeat:
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- return -EINVAL;
+ unsigned long dst_addr = vmf->address;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ if (!IS_NOFAULT(inode))
+ return -EINVAL;
+
+ _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ vma->vm_page_prot));
+ dst_pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, dst_addr, &ptl);
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+ set_pte_at(vma->vm_mm, dst_addr, dst_pte, _dst_pte);
+ update_mmu_cache(vma, dst_addr, dst_pte);
+ *fault_type = VM_FAULT_NOPAGE;
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
}
sbinfo = SHMEM_SB(inode->i_sb);
@@ -3819,6 +3842,23 @@ const struct address_space_operations shmem_aops = {
};
EXPORT_SYMBOL(shmem_aops);
+static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+
+ fileattr_fill_flags(fa, inode->i_flags);
+
+ return 0;
+}
+
+static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ inode->i_flags = fa->flags;
+ return 0;
+}
+
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
.get_unmapped_area = shmem_get_unmapped_area,
@@ -3836,6 +3876,8 @@ static const struct file_operations shmem_file_operations = {
static const struct inode_operations shmem_inode_operations = {
.getattr = shmem_getattr,
.setattr = shmem_setattr,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
.set_acl = simple_set_acl,
Powered by blists - more mailing lists