[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f1bf4b452cc10281ef831c5e38ce16f09923f8c5.1746040540.git.lorenzo.stoakes@oracle.com>
Date: Wed, 30 Apr 2025 20:54:11 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: David Hildenbrand <david@...hat.com>,
"Liam R . Howlett" <Liam.Howlett@...cle.com>,
Vlastimil Babka <vbabka@...e.cz>, Mike Rapoport <rppt@...nel.org>,
Suren Baghdasaryan --cc=Michal Hocko <"surenb@...gle.commhocko"@suse.com>,
Jann Horn <jannh@...gle.com>, Pedro Falcato <pfalcato@...e.de>,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, Alexander Viro <viro@...iv.linux.org.uk>,
Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>
Subject: [RFC PATCH 1/3] mm: introduce new .mmap_proto() f_op callback
Provide a means by which drivers can specify which fields of those
permitted to be changed should be altered to prior to mmap()'ing a
range (which may either result from a merge or from mapping an entirely new
VMA).
Doing so is substantially safer than the existing .mmap() calback which
provides unrestricted access to the part-constructed VMA and permits
drivers and file systems to do 'creative' things which makes it hard to
reason about the state of the VMA after the function returns.
The existing .mmap() callback's freedom has caused a great deal of issues,
especially in error handling, as unwinding the mmap() state has proven to
be non-trivial and caused significant issues in the past, for instance
those addressed in commit 5de195060b2e ("mm: resolve faulty mmap_region()
error path behaviour").
It also necessitates a second attempt at merge once the .mmap() callback
has completed, which has caused issues in the past, is awkward, adds
overhead and is difficult to reason about.
The .mmap_proto() callback eliminates this requirement, as we can update
fields prior to even attempting the first merge. It is safer, as we heavily
restrict what can actually be modified, and being invoked very early in the
mmap() process, error handling can be performed safely with very little
unwinding of state required.
Update vma userland test stubs to account for changes.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
---
include/linux/fs.h | 7 +++
include/linux/mm_types.h | 24 +++++++++
mm/memory.c | 3 +-
mm/mmap.c | 2 +-
mm/vma.c | 87 +++++++++++++++++++++++++++++++-
tools/testing/vma/vma_internal.h | 38 ++++++++++++++
6 files changed, 158 insertions(+), 3 deletions(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 016b0fe1536e..f8ccdf5419fc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2169,6 +2169,7 @@ struct file_operations {
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
+ int (*mmap_proto)(struct vma_proto *);
} __randomize_layout;
/* Supports async buffered reads */
@@ -2243,6 +2244,12 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
return file->f_op->mmap(file, vma);
}
+/* Does the file have an .mmap() hook? */
+static inline bool file_has_mmap_hook(struct file *file)
+{
+ return file->f_op->mmap || file->f_op->mmap_proto;
+}
+
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e76bade9ebb1..b21d01efc541 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -763,6 +763,30 @@ struct vma_numab_state {
int prev_scan_seq;
};
+/*
+ * Describes a prototype VMA that is about to be mmap()'ed. Drivers may choose
+ * to manipulate non-const fields, which will cause those fields to be updated
+ * in the resultant VMA.
+ *
+ * Helper functions are not required for manipulating any field.
+ */
+struct vma_proto {
+ /* Immutable state. */
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+
+ /* Mutable fields. Populated with initial state. */
+ pgoff_t pgoff;
+ struct file *file;
+ vm_flags_t flags;
+ pgprot_t page_prot;
+
+ /* Write-only fields. */
+ const struct vm_operations_struct *vm_ops;
+ void *private_data;
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
diff --git a/mm/memory.c b/mm/memory.c
index 68c1d962d0ad..98a20565825b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -527,10 +527,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps mmap_proto: %ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap_proto : NULL,
mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
diff --git a/mm/mmap.c b/mm/mmap.c
index 81dd962a1cfc..411309c7b235 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -475,7 +475,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!file_has_mmap_hook(file))
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
diff --git a/mm/vma.c b/mm/vma.c
index 1f2634b29568..76bd3a67ce0f 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -17,6 +17,11 @@ struct mmap_state {
unsigned long pglen;
unsigned long flags;
struct file *file;
+ pgprot_t page_prot;
+
+ /* User-defined fields, perhaps updated by .mmap_proto(). */
+ const struct vm_operations_struct *vm_ops;
+ void *vm_private_data;
unsigned long charged;
bool retry_merge;
@@ -40,6 +45,7 @@ struct mmap_state {
.pglen = PHYS_PFN(len_), \
.flags = flags_, \
.file = file_, \
+ .page_prot = vm_get_page_prot(flags_), \
}
#define VMG_MMAP_STATE(name, map_, vma_) \
@@ -2384,7 +2390,17 @@ static int __mmap_new_file_vma(struct mmap_state *map,
struct vma_iterator *vmi = map->vmi;
int error;
+ VM_WARN_ON(!file_has_mmap_hook(map->file));
+
vma->vm_file = get_file(map->file);
+
+ /*
+ * The caller might only define .mmap_proto(), in which case we have
+ * nothing further to do.
+ */
+ if (!map->file->f_op->mmap)
+ return 0;
+
error = mmap_file(vma->vm_file, vma);
if (error) {
fput(vma->vm_file);
@@ -2441,7 +2457,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_iter_config(vmi, map->addr, map->end);
vma_set_range(vma, map->addr, map->end, map->pgoff);
vm_flags_init(vma, map->flags);
- vma->vm_page_prot = vm_get_page_prot(map->flags);
+ vma->vm_page_prot = map->page_prot;
if (vma_iter_prealloc(vmi, vma)) {
error = -ENOMEM;
@@ -2528,6 +2544,69 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
+/* Does the driver backing this implement an .mmap_proto() hook? */
+static bool have_mmap_proto_hook(struct mmap_state *map)
+{
+ struct file *file = map->file;
+
+ return file && file->f_op->mmap_proto;
+}
+
+/*
+ * Invoke the f_op->mmap_proto() callback for a file-backed mapping that
+ * specifies it.
+ *
+ * This is called prior to any merge attempt, and updates whitelisted fields
+ * that are permitted to be updated by the caller.
+ *
+ * All but user-defined fields will be pre-populated with original values
+ *
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int call_proto(struct mmap_state *map)
+{
+ int err;
+ const struct file_operations *f_op = map->file->f_op;
+ struct vma_proto proto = {
+ .mm = map->mm,
+ .start = map->addr,
+ .end = map->end,
+
+ .pgoff = map->pgoff,
+ .file = map->file,
+ .flags = map->flags,
+ };
+
+ /* Invoke the hook. */
+ err = f_op->mmap_proto(&proto);
+ if (err)
+ return err;
+
+ /* Update fields permitted to be changed. */
+ map->pgoff = proto.pgoff;
+ map->file = proto.file;
+ map->flags = proto.flags;
+ map->page_prot = proto.page_prot;
+ /* User-defined fields. */
+ map->vm_ops = proto.vm_ops;
+ map->vm_private_data = proto.private_data;
+
+ return 0;
+}
+
+static void set_vma_user_defined_fields(struct vm_area_struct *vma,
+ struct mmap_state *map)
+{
+ /*
+ * If the .mmap() handler set these, that takes precedent (indicated by
+ * the vma fields being non-empty).
+ */
+ if (map->vm_ops && vma->vm_ops == &vma_dummy_vm_ops)
+ vma->vm_ops = map->vm_ops;
+ if (!vma->vm_private_data)
+ vma->vm_private_data = map->vm_private_data;
+}
+
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
@@ -2537,8 +2616,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
int error;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+ bool have_proto = have_mmap_proto_hook(&map);
error = __mmap_prepare(&map, uf);
+ if (!error && have_proto)
+ error = call_proto(&map);
if (error)
goto abort_munmap;
@@ -2556,6 +2638,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
goto unacct_error;
}
+ if (have_proto)
+ set_vma_user_defined_fields(vma, &map);
+
/* If flags changed, we might be able to merge, so try again. */
if (map.retry_merge) {
struct vm_area_struct *merged;
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 198abe66de5a..56a49d455949 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -253,8 +253,40 @@ struct mm_struct {
unsigned long flags; /* Must use atomic bitops to access */
};
+struct vm_area_struct;
+
+/*
+ * Describes a prototype VMA that is about to be mmap()'ed. Drivers may choose
+ * to manipulate non-const fields, which will cause those fields to be updated
+ * in the resultant VMA.
+ *
+ * Helper functions are not required for manipulating any field.
+ */
+struct vma_proto {
+ /* Immutable state. */
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+
+ /* Mutable fields. Populated with initial state. */
+ pgoff_t pgoff;
+ struct file *file;
+ vm_flags_t flags;
+ pgprot_t page_prot;
+
+ /* Write-only fields. */
+ const struct vm_operations_struct *vm_ops;
+ void *private_data;
+};
+
+struct file_operations {
+ int (*mmap)(struct file *, struct vm_area_struct *);
+ int (*mmap_proto)(struct vma_proto *);
+};
+
struct file {
struct address_space *f_mapping;
+ const struct file_operations *f_op;
};
#define VMA_LOCK_OFFSET 0x40000000
@@ -1405,4 +1437,10 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma)
(void)vma;
}
+/* Does the file have an .mmap() hook? */
+static inline bool file_has_mmap_hook(struct file *file)
+{
+ return file->f_op->mmap || file->f_op->mmap_proto;
+}
+
#endif /* __MM_VMA_INTERNAL_H */
--
2.49.0
Powered by blists - more mailing lists