[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2cb841ca-2a04-f088-cee2-6c020ecc9508@redhat.com>
Date: Mon, 6 Sep 2021 11:35:01 +0200
From: David Hildenbrand <david@...hat.com>
To: Huang Shijie <shijie@...amperecomputing.com>,
viro@...iv.linux.org.uk
Cc: akpm@...ux-foundation.org, jlayton@...nel.org,
bfields@...ldses.org, torvalds@...ux-foundation.org,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, song.bao.hua@...ilicon.com,
patches@...erecomputing.com, zwang@...erecomputing.com
Subject: Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA
replication
On 06.09.21 18:16, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
>
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.
Am I missing something important or is this just absolutely not what we
want?
This means that for each and every invocation of the binary, we'll COW
the complete binary -- an awful lot of wasted main memory and swap.
This is not a NUMA replication, this is en executable ELF code replication.
>
> Signed-off-by: Huang Shijie <shijie@...amperecomputing.com>
> ---
> fs/binfmt_elf.c | 27 ++++++++++++++++++++++-----
> fs/exec.c | 5 ++++-
> include/linux/binfmts.h | 1 +
> include/linux/mm.h | 2 ++
> include/uapi/linux/fcntl.h | 2 ++
> mm/mprotect.c | 2 +-
> 6 files changed, 32 insertions(+), 7 deletions(-)
>
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 439ed81e755a..fac8f4a4555a 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
>
> static unsigned long elf_map(struct file *filep, unsigned long addr,
> const struct elf_phdr *eppnt, int prot, int type,
> - unsigned long total_size)
> + unsigned long total_size, int numa_replication)
> {
> unsigned long map_addr;
> unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
> unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
> addr = ELF_PAGESTART(addr);
> size = ELF_PAGEALIGN(size);
> + int ret;
>
> /* mmap() will return -EINVAL if given a zero size, but a
> * segment with zero filesize is perfectly valid */
> @@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
> */
> if (total_size) {
> total_size = ELF_PAGEALIGN(total_size);
> - map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +
> + if (numa_replication) {
> + /* Trigger the COW for this ELF code section */
> + map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
> + type | MAP_POPULATE, off);
> + if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
> + /* Change back */
> + ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
> + if (ret)
> + return ret;
> + }
> + } else {
> + map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> + }
> +
> if (!BAD_ADDR(map_addr))
> vm_munmap(map_addr+size, total_size-size);
> - } else
> + } else {
> map_addr = vm_mmap(filep, addr, size, prot, type, off);
> + }
>
> if ((type & MAP_FIXED_NOREPLACE) &&
> PTR_ERR((void *)map_addr) == -EEXIST)
> @@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
> load_addr = -vaddr;
>
> map_addr = elf_map(interpreter, load_addr + vaddr,
> - eppnt, elf_prot, elf_type, total_size);
> + eppnt, elf_prot, elf_type, total_size, 0);
> total_size = 0;
> error = map_addr;
> if (BAD_ADDR(map_addr))
> @@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
> }
>
> error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
> - elf_prot, elf_flags, total_size);
> + elf_prot, elf_flags, total_size,
> + bprm->support_numa_replication);
> if (BAD_ADDR(error)) {
> retval = IS_ERR((void *)error) ?
> PTR_ERR((void*)error) : -EINVAL;
> diff --git a/fs/exec.c b/fs/exec.c
> index 38f63451b928..d27efa540641 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
> .lookup_flags = LOOKUP_FOLLOW,
> };
>
> - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
> + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
> return ERR_PTR(-EINVAL);
> if (flags & AT_SYMLINK_NOFOLLOW)
> open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
> @@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
> if (retval)
> goto out;
>
> + /* Do we support NUMA replication for this program? */
> + bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
> +
> retval = exec_binprm(bprm);
> if (retval < 0)
> goto out;
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 049cf9421d83..1874e1732f20 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -64,6 +64,7 @@ struct linux_binprm {
> struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
>
> char buf[BINPRM_BUF_SIZE];
> + int support_numa_replication;
> } __randomize_layout;
>
> #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7ca22e6e694a..76611381be2a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
> #endif
>
> extern int sysctl_nr_trim_pages;
> +int do_mprotect_pkey(unsigned long start, size_t len,
> + unsigned long prot, int pkey);
>
> #ifdef CONFIG_PRINTK
> void mem_dump_obj(void *object);
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index 2f86b2ad6d7e..de99c5ae8eca 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -111,4 +111,6 @@
>
> #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
>
> +#define AT_NUMA_REPLICATION 0x10000 /* Support NUMA replication for the ELF program */
> +
> #endif /* _UAPI_LINUX_FCNTL_H */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 883e2cc85cad..d1f8cececfed 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
> /*
> * pkey==-1 when doing a legacy mprotect()
> */
> -static int do_mprotect_pkey(unsigned long start, size_t len,
> +int do_mprotect_pkey(unsigned long start, size_t len,
> unsigned long prot, int pkey)
> {
> unsigned long nstart, end, tmp, reqprot;
>
--
Thanks,
David / dhildenb
Powered by blists - more mailing lists