[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241206212846.210835-1-lorenzo.stoakes@oracle.com>
Date: Fri, 6 Dec 2024 21:28:46 +0000
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: "Liam R . Howlett" <Liam.Howlett@...cle.com>,
Muchun Song <muchun.song@...ux.dev>, Vlastimil Babka <vbabka@...e.cz>,
Jann Horn <jannh@...gle.com>, Hugh Dickins <hughd@...gle.com>,
linux-mm@...ck.org, linux-kernel@...r.kernel.org,
Isaac Manjarres <isaacmanjarres@...gle.com>,
Kalesh Singh <kaleshsingh@...gle.com>
Subject: [PATCH] mm: perform all memfd seal checks in a single place
We no longer need to perform these checks in the f_op->mmap() hook.
We already moved the operation which clears VM_MAYWRITE on a read-only
mapping of a write-sealed memfd in order to work around the restrictions
imposed by commit 5de195060b2e ("mm: resolve faulty mmap_region() error
path behaviour").
There is no reason for us not to simply go ahead and additionally check to
see if any pre-existing seals are in place here rather than defer this to
the f_op->mmap() hook.
By doing this we remove more logic from shmem_mmap() which doesn't belong
there, as well as doing the same for hugetlbfs_file_mmap(). We also remove
dubious shared logic in mm.h which simply does not belong there either.
It makes sense to do these checks at the earliest opportunity: we know
these are shmem (or hugetlbfs) mappings whose relevant VMA flags will not
change from the invoking do_mmap(), so there is simply no need to wait.
This also means that further memfd seal flags can be implemented entirely
within mm/memfd.c, where they also have the opportunity to modify VMA flags
as necessary early in the mapping logic.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
---
fs/hugetlbfs/inode.c | 5 ----
include/linux/memfd.h | 22 ++++++++---------
include/linux/mm.h | 55 -------------------------------------------
mm/memfd.c | 44 +++++++++++++++++++++++++++++++++-
mm/mmap.c | 12 +++++++---
mm/shmem.c | 6 -----
6 files changed, 62 insertions(+), 82 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a5ea006f403e..d8f852b52c56 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -99,7 +99,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
@@ -116,10 +115,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED);
vma->vm_ops = &hugetlb_vm_ops;
- ret = seal_check_write(info->seals, vma);
- if (ret)
- return ret;
-
/*
* page based offset in vm_pgoff could be sufficiently large to
* overflow a loff_t when converted to byte offset. This can
diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index d437e3070850..d53408b0bd31 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -7,7 +7,14 @@
#ifdef CONFIG_MEMFD_CREATE
extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
-unsigned int *memfd_file_seals_ptr(struct file *file);
+/*
+ * Check for any existing seals on mmap, return an error if access is denied due
+ * to sealing, or 0 otherwise.
+ *
+ * We also update VMA flags if appropriate by manipulating the VMA flags pointed
+ * to by vm_flags_ptr.
+ */
+int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr);
#else
static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
{
@@ -17,19 +24,10 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
return ERR_PTR(-EINVAL);
}
-
-static inline unsigned int *memfd_file_seals_ptr(struct file *file)
+static inline int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags)
{
- return NULL;
+ return 0; /* !CONFIG_MEMFD_CREATE: never deny */
}
#endif
-/* Retrieve memfd seals associated with the file, if any. */
-static inline unsigned int memfd_file_seals(struct file *file)
-{
- unsigned int *sealsp = memfd_file_seals_ptr(file);
-
- return sealsp ? *sealsp : 0;
-}
-
#endif /* __LINUX_MEMFD_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd2b33157d5b..d642f637cef8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4091,61 +4091,6 @@ void mem_dump_obj(void *object);
static inline void mem_dump_obj(void *object) {}
#endif
-static inline bool is_write_sealed(int seals)
-{
- return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
-}
-
-/**
- * is_readonly_sealed - Checks whether write-sealed but mapped read-only,
- * in which case writes should be disallowing moving
- * forwards.
- * @seals: the seals to check
- * @vm_flags: the VMA flags to check
- *
- * Returns whether readonly sealed, in which case writess should be disallowed
- * going forward.
- */
-static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags)
-{
- /*
- * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (is_write_sealed(seals) &&
- ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED))
- return true;
-
- return false;
-}
-
-/**
- * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
- * handle them.
- * @seals: the seals to check
- * @vma: the vma to operate on
- *
- * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
- * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
- */
-static inline int seal_check_write(int seals, struct vm_area_struct *vma)
-{
- if (!is_write_sealed(seals))
- return 0;
-
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * write seals are active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- return 0;
-}
-
#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
unsigned long len_in,
diff --git a/mm/memfd.c b/mm/memfd.c
index 35a370d75c9a..5f5a23c9051d 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
return error;
}
-unsigned int *memfd_file_seals_ptr(struct file *file)
+static unsigned int *memfd_file_seals_ptr(struct file *file)
{
if (shmem_file(file))
return &SHMEM_I(file_inode(file))->seals;
@@ -327,6 +327,48 @@ static int check_sysctl_memfd_noexec(unsigned int *flags)
return 0;
}
+static inline bool is_write_sealed(unsigned int seals)
+{
+ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+static int check_write_seal(unsigned long *vm_flags_ptr)
+{
+ unsigned long vm_flags = *vm_flags_ptr;
+ unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
+
+ /* If a private mapping then writability is irrelevant. */
+ if (!(mask & VM_SHARED))
+ return 0;
+
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * write seals are active.
+ */
+ if (mask & VM_WRITE)
+ return -EPERM;
+
+ /*
+ * This is a read-only mapping, disallow mprotect() from making a
+ * write-sealed mapping writable in future.
+ */
+ *vm_flags_ptr &= ~VM_MAYWRITE;
+
+ return 0;
+}
+
+int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
+{
+ int err = 0;
+ unsigned int *seals_ptr = memfd_file_seals_ptr(file);
+ unsigned int seals = seals_ptr ? *seals_ptr : 0;
+
+ if (is_write_sealed(seals))
+ err = check_write_seal(vm_flags_ptr);
+
+ return err;
+}
+
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
diff --git a/mm/mmap.c b/mm/mmap.c
index 1c6bdffa13dd..902fe4266448 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -368,8 +368,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) {
struct inode *inode = file_inode(file);
- unsigned int seals = memfd_file_seals(file);
unsigned long flags_mask;
+ int err;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
@@ -409,8 +409,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
- else if (is_readonly_sealed(seals, vm_flags))
- vm_flags &= ~VM_MAYWRITE;
fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
@@ -430,6 +428,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
default:
return -EINVAL;
}
+
+ /*
+ * Check to see if we are violating any seals and update VMA
+ * flags if necessary to avoid future seal violations.
+ */
+ err = memfd_check_seals_mmap(file, &vm_flags);
+ if (err)
+ return (unsigned long)err;
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
diff --git a/mm/shmem.c b/mm/shmem.c
index f4e9c94ed8ae..41d7a181ed89 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2815,12 +2815,6 @@ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- struct shmem_inode_info *info = SHMEM_I(inode);
- int ret;
-
- ret = seal_check_write(info->seals, vma);
- if (ret)
- return ret;
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
--
2.47.1
Powered by blists - more mailing lists