lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2cb841ca-2a04-f088-cee2-6c020ecc9508@redhat.com>
Date:   Mon, 6 Sep 2021 11:35:01 +0200
From:   David Hildenbrand <david@...hat.com>
To:     Huang Shijie <shijie@...amperecomputing.com>,
        viro@...iv.linux.org.uk
Cc:     akpm@...ux-foundation.org, jlayton@...nel.org,
        bfields@...ldses.org, torvalds@...ux-foundation.org,
        linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, song.bao.hua@...ilicon.com,
        patches@...erecomputing.com, zwang@...erecomputing.com
Subject: Re: [RFC PATCH] fs/exec: Add the support for ELF program's NUMA
 replication

On 06.09.21 18:16, Huang Shijie wrote:
> This patch adds AT_NUMA_REPLICATION for execveat().
> 
> If this flag is set, the kernel will trigger COW(copy on write)
> on the mmapped ELF binary. So the program will have a copied-page
> on its NUMA node, even if the original page in page cache is
> on other NUMA nodes.

Am I missing something important or is this just absolutely not what we 
want?

This means that for each and every invocation of the binary, we'll COW 
the complete binary -- an awful lot of wasted main memory and swap.

This is not a NUMA replication, this is en executable ELF code replication.

> 
> Signed-off-by: Huang Shijie <shijie@...amperecomputing.com>
> ---
>   fs/binfmt_elf.c            | 27 ++++++++++++++++++++++-----
>   fs/exec.c                  |  5 ++++-
>   include/linux/binfmts.h    |  1 +
>   include/linux/mm.h         |  2 ++
>   include/uapi/linux/fcntl.h |  2 ++
>   mm/mprotect.c              |  2 +-
>   6 files changed, 32 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 439ed81e755a..fac8f4a4555a 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -362,13 +362,14 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
>   
>   static unsigned long elf_map(struct file *filep, unsigned long addr,
>   		const struct elf_phdr *eppnt, int prot, int type,
> -		unsigned long total_size)
> +		unsigned long total_size, int numa_replication)
>   {
>   	unsigned long map_addr;
>   	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
>   	addr = ELF_PAGESTART(addr);
>   	size = ELF_PAGEALIGN(size);
> +	int ret;
>   
>   	/* mmap() will return -EINVAL if given a zero size, but a
>   	 * segment with zero filesize is perfectly valid */
> @@ -385,11 +386,26 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
>   	*/
>   	if (total_size) {
>   		total_size = ELF_PAGEALIGN(total_size);
> -		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +
> +		if (numa_replication) {
> +			/* Trigger the COW for this ELF code section */
> +			map_addr = vm_mmap(filep, addr, total_size, prot | PROT_WRITE,
> +						type | MAP_POPULATE, off);
> +			if (!IS_ERR_VALUE(map_addr) && !(prot & PROT_WRITE)) {
> +				/* Change back */
> +				ret = do_mprotect_pkey(map_addr, total_size, prot, -1);
> +				if (ret)
> +					return ret;
> +			}
> +		} else {
> +			map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
> +		}
> +
>   		if (!BAD_ADDR(map_addr))
>   			vm_munmap(map_addr+size, total_size-size);
> -	} else
> +	} else {
>   		map_addr = vm_mmap(filep, addr, size, prot, type, off);
> +	}
>   
>   	if ((type & MAP_FIXED_NOREPLACE) &&
>   	    PTR_ERR((void *)map_addr) == -EEXIST)
> @@ -635,7 +651,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   				load_addr = -vaddr;
>   
>   			map_addr = elf_map(interpreter, load_addr + vaddr,
> -					eppnt, elf_prot, elf_type, total_size);
> +					eppnt, elf_prot, elf_type, total_size, 0);
>   			total_size = 0;
>   			error = map_addr;
>   			if (BAD_ADDR(map_addr))
> @@ -1139,7 +1155,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
>   		}
>   
>   		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
> -				elf_prot, elf_flags, total_size);
> +				elf_prot, elf_flags, total_size,
> +				bprm->support_numa_replication);
>   		if (BAD_ADDR(error)) {
>   			retval = IS_ERR((void *)error) ?
>   				PTR_ERR((void*)error) : -EINVAL;
> diff --git a/fs/exec.c b/fs/exec.c
> index 38f63451b928..d27efa540641 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -900,7 +900,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
>   		.lookup_flags = LOOKUP_FOLLOW,
>   	};
>   
> -	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
> +	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_NUMA_REPLICATION)) != 0)
>   		return ERR_PTR(-EINVAL);
>   	if (flags & AT_SYMLINK_NOFOLLOW)
>   		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
> @@ -1828,6 +1828,9 @@ static int bprm_execve(struct linux_binprm *bprm,
>   	if (retval)
>   		goto out;
>   
> +	/* Do we support NUMA replication for this program? */
> +	bprm->support_numa_replication = flags & AT_NUMA_REPLICATION;
> +
>   	retval = exec_binprm(bprm);
>   	if (retval < 0)
>   		goto out;
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 049cf9421d83..1874e1732f20 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -64,6 +64,7 @@ struct linux_binprm {
>   	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
>   
>   	char buf[BINPRM_BUF_SIZE];
> +	int support_numa_replication;
>   } __randomize_layout;
>   
>   #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7ca22e6e694a..76611381be2a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3244,6 +3244,8 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
>   #endif
>   
>   extern int sysctl_nr_trim_pages;
> +int do_mprotect_pkey(unsigned long start, size_t len,
> +			unsigned long prot, int pkey);
>   
>   #ifdef CONFIG_PRINTK
>   void mem_dump_obj(void *object);
> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> index 2f86b2ad6d7e..de99c5ae8eca 100644
> --- a/include/uapi/linux/fcntl.h
> +++ b/include/uapi/linux/fcntl.h
> @@ -111,4 +111,6 @@
>   
>   #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
>   
> +#define AT_NUMA_REPLICATION	0x10000	/* Support NUMA replication for the ELF program */
> +
>   #endif /* _UAPI_LINUX_FCNTL_H */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 883e2cc85cad..d1f8cececfed 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -519,7 +519,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
>   /*
>    * pkey==-1 when doing a legacy mprotect()
>    */
> -static int do_mprotect_pkey(unsigned long start, size_t len,
> +int do_mprotect_pkey(unsigned long start, size_t len,
>   		unsigned long prot, int pkey)
>   {
>   	unsigned long nstart, end, tmp, reqprot;
> 


-- 
Thanks,

David / dhildenb

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ