[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241105165515.154941-1-shivankg@amd.com>
Date: Tue, 5 Nov 2024 16:55:13 +0000
From: Shivank Garg <shivankg@....com>
To: <x86@...nel.org>, <viro@...iv.linux.org.uk>, <brauner@...nel.org>,
<jack@...e.cz>, <akpm@...ux-foundation.org>, <linux-kernel@...r.kernel.org>,
<linux-fsdevel@...r.kernel.org>, <linux-mm@...ck.org>,
<linux-api@...r.kernel.org>, <linux-arch@...r.kernel.org>,
<kvm@...r.kernel.org>
CC: <chao.gao@...el.com>, <pgonda@...gle.com>, <thomas.lendacky@....com>,
<seanjc@...gle.com>, <luto@...nel.org>, <tglx@...utronix.de>,
<mingo@...hat.com>, <bp@...en8.de>, <dave.hansen@...ux.intel.com>,
<willy@...radead.org>, <arnd@...db.de>, <pbonzini@...hat.com>,
<kees@...nel.org>, <shivankg@....com>, <bharata@....com>, <nikunj@....com>,
<michael.day@....com>, <Neeraj.Upadhyay@....com>, Shivansh Dhiman
<shivansh.dhiman@....com>
Subject: [RFC PATCH 2/4] Introduce fbind syscall
The new fbind syscall sets the NUMA memory policy for file-backed memory
and has the following signature:
long fbind(unsigned int fd, unsigned long mode,
const unsigned long nodemask[(.maxnode + ULONG_WIDTH - 1)
/ ULONG_WIDTH],
unsigned long maxnode, unsigned int flags);
fbind behaves similarly to mbind, except that it takes a file descriptor as
input instead of an address range.
TODO:
1. Support fbind syscall on all architectures.
2. Expand commit msg and add documentation.
3. Clean up the code.
[Shivansh: add create_mpol_from_args()]
Signed-off-by: Shivansh Dhiman <shivansh.dhiman@....com>
Signed-off-by: Shivank Garg <shivankg@....com>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
include/linux/fs.h | 3 ++
include/linux/mempolicy.h | 3 ++
include/linux/syscalls.h | 3 ++
include/uapi/asm-generic/unistd.h | 5 ++-
kernel/sys_ni.c | 1 +
mm/Makefile | 2 +-
mm/fbind.c | 49 +++++++++++++++++++++++
mm/mempolicy.c | 55 ++++++++++++++++++++++++++
10 files changed, 121 insertions(+), 2 deletions(-)
create mode 100644 mm/fbind.c
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 534c74b14fab..0660ce6d08d8 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -468,3 +468,4 @@
460 i386 lsm_set_self_attr sys_lsm_set_self_attr
461 i386 lsm_list_modules sys_lsm_list_modules
462 i386 mseal sys_mseal
+463 i386 fbind sys_fbind
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7093ee21c0d1..9794347cc2e6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -386,6 +386,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common fbind sys_fbind
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..42042b62bdcd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2058,6 +2058,9 @@ struct file_operations {
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
+#ifdef CONFIG_NUMA
+ int (*set_policy)(struct file *, struct mempolicy *);
+#endif
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1add16f21612..b9023f6246a7 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -299,4 +299,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
}
#endif /* CONFIG_NUMA */
+struct mempolicy *create_mpol_from_args(unsigned char mode,
+ const unsigned long __user *nmask,
+ unsigned short maxnode);
#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4bcf6754738d..2dc686921b9f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -502,6 +502,9 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *bu
asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
struct stat __user *statbuf, int flag);
asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
+asmlinkage long sys_fbind(unsigned int fd, unsigned long mode,
+ const unsigned long __user *nmask,
+ unsigned long maxnode, unsigned int flags);
#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 5bf6148cac2b..550730f36dae 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -841,8 +841,11 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
#define __NR_mseal 462
__SYSCALL(__NR_mseal, sys_mseal)
+#define __NR_fbind 463
+__SYSCALL(__NR_fbind, sys_fbind)
+
#undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 464
/*
* 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..f57350e581f6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -195,6 +195,7 @@ COND_SYSCALL(move_pages);
COND_SYSCALL(set_mempolicy_home_node);
COND_SYSCALL(cachestat);
COND_SYSCALL(mseal);
+COND_SYSCALL(fbind);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
diff --git a/mm/Makefile b/mm/Makefile
index d2915f8c9dc0..ba339ddc0be2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -79,7 +79,7 @@ obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o
-obj-$(CONFIG_NUMA) += mempolicy.o
+obj-$(CONFIG_NUMA) += mempolicy.o fbind.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/fbind.c b/mm/fbind.c
new file mode 100644
index 000000000000..85ec7d13345c
--- /dev/null
+++ b/mm/fbind.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implement fbind() syscall.
+ *
+ * Copyright (c) 2024 AMD
+ *
+ * Author: Shivank Garg <shivankg@....com>
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+
+/*
+ * do_fbind - apply a NUMA memory policy to the file referred to by @fd.
+ * @fd:      file descriptor of the target file
+ * @mode:    memory policy mode, handed to create_mpol_from_args()
+ * @nmask:   user-space node bitmask, handed to create_mpol_from_args()
+ * @maxnode: number of bits in @nmask, handed to create_mpol_from_args()
+ * @flags:   must be zero; no flags are defined
+ *
+ * Builds a mempolicy with create_mpol_from_args() and passes it to the
+ * file's ->set_policy() operation. The local reference on the policy is
+ * dropped with mpol_put() before returning.
+ *
+ * Returns the result of ->set_policy(), or:
+ *   -EINVAL     if @flags is non-zero, or @mode/@maxnode do not fit the
+ *               parameter types of create_mpol_from_args()
+ *   -EBADF      if @fd does not refer to an open file
+ *   -EOPNOTSUPP if the file has no ->set_policy() operation
+ *   the error encoded by create_mpol_from_args() if policy creation fails
+ */
+static long do_fbind(unsigned int fd, unsigned long mode,
+		     const unsigned long __user *nmask,
+		     unsigned long maxnode, unsigned int flags)
+{
+	struct mempolicy *mpol;
+	struct fd f;
+	int ret;
+
+	/*
+	 * Reject unknown flags so that they can be given a meaning later
+	 * without breaking callers that passed garbage.
+	 */
+	if (flags)
+		return -EINVAL;
+
+	/*
+	 * create_mpol_from_args() takes an unsigned char mode and an
+	 * unsigned short maxnode. Refuse values that would otherwise be
+	 * silently truncated by the implicit conversion at the call below.
+	 */
+	if (mode != (unsigned char)mode || maxnode != (unsigned short)maxnode)
+		return -EINVAL;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	mpol = create_mpol_from_args(mode, nmask, maxnode);
+	if (IS_ERR(mpol)) {
+		ret = PTR_ERR(mpol);
+		goto out_putf;
+	}
+
+	/*
+	 * create_mpol_from_args() can return NULL (it propagates a NULL
+	 * result from mpol_new()). Forward it to ->set_policy() instead of
+	 * returning 0 without doing anything.
+	 * NOTE(review): confirm every ->set_policy() implementation treats a
+	 * NULL policy as "use the default policy".
+	 */
+	if (f.file->f_op->set_policy)
+		ret = f.file->f_op->set_policy(f.file, mpol);
+	else
+		ret = -EOPNOTSUPP;
+
+	/* NOTE(review): relies on mpol_put() accepting NULL - confirm. */
+	mpol_put(mpol);
+out_putf:
+	fdput(f);
+	return ret;
+}
+
+/*
+ * fbind() system call entry point.
+ *
+ * All five arguments are forwarded unchanged to do_fbind(), whose return
+ * value becomes the result of the system call.
+ */
+SYSCALL_DEFINE5(fbind,
+		unsigned int, fd,
+		unsigned long, mode,
+		const unsigned long __user *, nmask,
+		unsigned long, maxnode,
+		unsigned int, flags)
+{
+	return do_fbind(fd, mode, nmask, maxnode, flags);
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b858e22b259d..3a697080ecad 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3557,3 +3557,58 @@ static int __init mempolicy_sysfs_init(void)
late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */
+
+/**
+ * create_mpol_from_args - create a mempolicy structure from args
+ * @mode: NUMA memory policy mode
+ * @nmask: user-space bitmask of NUMA nodes
+ * @maxnode: number of bits in the nodes bitmask
+ *
+ * Create a mempolicy from given nodemask and memory policy such as
+ * default, preferred, interleave or bind.
+ *
+ * The mmap lock of current->mm is taken for writing around the call to
+ * mpol_set_nodemask().
+ *
+ * Return: the new memory policy on success; the unmodified result of
+ * mpol_new() if that is NULL or an error pointer; otherwise an
+ * ERR_PTR()-encoded error from sanitize_mpol_flags(), get_nodes(),
+ * mpol_set_nodemask(), or -ENOMEM if the scratch nodemask cannot be
+ * allocated. Callers must therefore check for NULL as well as IS_ERR().
+ */
+struct mempolicy *create_mpol_from_args(unsigned char mode,
+					const unsigned long __user *nmask,
+					unsigned short maxnode)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned short mode_flags;
+	struct mempolicy *mpol;
+	nodemask_t nodes;
+	int lmode = mode;
+	int err;
+
+	err = sanitize_mpol_flags(&lmode, &mode_flags);
+	if (err)
+		return ERR_PTR(err);
+
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * sanitize_mpol_flags() works on lmode through a pointer and may
+	 * rewrite it, so build the policy from the sanitized value rather
+	 * than from the raw argument.
+	 */
+	mpol = mpol_new(lmode, mode_flags, &nodes);
+	if (IS_ERR_OR_NULL(mpol))
+		return mpol;
+
+	{
+		/*
+		 * NODEMASK_SCRATCH() is a declaration; keep it at the start
+		 * of its own block so it does not follow statements.
+		 */
+		NODEMASK_SCRATCH(scratch);
+
+		if (scratch) {
+			mmap_write_lock(mm);
+			err = mpol_set_nodemask(mpol, &nodes, scratch);
+			mmap_write_unlock(mm);
+		} else {
+			err = -ENOMEM;
+		}
+		NODEMASK_SCRATCH_FREE(scratch);
+	}
+
+	if (err) {
+		/* Drop the reference obtained from mpol_new(). */
+		mpol_put(mpol);
+		return ERR_PTR(err);
+	}
+
+	return mpol;
+}
--
2.34.1
Powered by blists - more mailing lists