[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJqdLrogefL5ZkxJfbQ75u45BFFJxttJd1V4yf=KUPxdHg7ocg@mail.gmail.com>
Date: Tue, 10 Feb 2026 21:13:39 +0100
From: Alexander Mikhalitsyn <alexander@...alicyn.com>
To: Andrei Vagin <avagin@...gle.com>
Cc: Kees Cook <kees@...nel.org>, Andrew Morton <akpm@...ux-foundation.org>,
Cyrill Gorcunov <gorcunov@...il.com>, Mike Rapoport <rppt@...nel.org>, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org, linux-mm@...ck.org, criu@...ts.linux.dev,
Chen Ridong <chenridong@...wei.com>, Christian Brauner <brauner@...nel.org>,
David Hildenbrand <david@...nel.org>, Eric Biederman <ebiederm@...ssion.com>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, Michal Koutny <mkoutny@...e.com>
Subject: Re: [PATCH 2/4] exec: inherit HWCAPs from the parent process
Am Mo., 9. Feb. 2026 um 20:06 Uhr schrieb Andrei Vagin <avagin@...gle.com>:
>
> Introduces a mechanism to inherit hardware capabilities (AT_HWCAP,
> AT_HWCAP2, etc.) from a parent process when they have been modified via
> prctl.
>
> To support C/R operations (snapshots, live migration) in heterogeneous
> clusters, we must ensure that processes utilize CPU features available
> on all potential target nodes. To solve this, we need to advertise a
> common feature set across the cluster.
>
> This patch adds a new mm flag MMF_USER_HWCAP, which is set when the
> auxiliary vector is modified via prctl(PR_SET_MM, PR_SET_MM_AUXV). When
> execve() is called, if the current process has MMF_USER_HWCAP set, the
> HWCAP values are extracted from the current auxiliary vector and stored
> in the linux_binprm structure. These values are then used to populate
> the auxiliary vector of the new process, effectively inheriting the
> hardware capabilities.
>
> The inherited HWCAPs are masked with the hardware capabilities supported
> by the current kernel to ensure that we don't report more features than
> actually supported. This is important to avoid unexpected behavior,
> especially for processes with additional privileges.
>
> Signed-off-by: Andrei Vagin <avagin@...gle.com>
Cool stuff, LGTM!
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@...urfusion.io>
> ---
> fs/binfmt_elf.c | 8 +++---
> fs/binfmt_elf_fdpic.c | 8 +++---
> fs/exec.c | 61 ++++++++++++++++++++++++++++++++++++++++
> include/linux/binfmts.h | 11 ++++++++
> include/linux/mm_types.h | 2 ++
> kernel/fork.c | 3 ++
> kernel/sys.c | 5 +++-
> 7 files changed, 89 insertions(+), 9 deletions(-)
>
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 3eb734c192e9..aec129e33f0b 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -246,7 +246,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> */
> ARCH_DLINFO;
> #endif
> - NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
> + NEW_AUX_ENT(AT_HWCAP, bprm->hwcap);
> NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
> NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
> NEW_AUX_ENT(AT_PHDR, phdr_addr);
> @@ -264,13 +264,13 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
> NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
> #ifdef ELF_HWCAP2
> - NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
> + NEW_AUX_ENT(AT_HWCAP2, bprm->hwcap2);
> #endif
> #ifdef ELF_HWCAP3
> - NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
> + NEW_AUX_ENT(AT_HWCAP3, bprm->hwcap3);
> #endif
> #ifdef ELF_HWCAP4
> - NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
> + NEW_AUX_ENT(AT_HWCAP4, bprm->hwcap4);
> #endif
> NEW_AUX_ENT(AT_EXECFN, bprm->exec);
> if (k_platform) {
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index a3d4e6973b29..55b482f03c82 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -629,15 +629,15 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
> */
> ARCH_DLINFO;
> #endif
> - NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
> + NEW_AUX_ENT(AT_HWCAP, bprm->hwcap);
> #ifdef ELF_HWCAP2
> - NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
> + NEW_AUX_ENT(AT_HWCAP2, bprm->hwcap2);
> #endif
> #ifdef ELF_HWCAP3
> - NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
> + NEW_AUX_ENT(AT_HWCAP3, bprm->hwcap3);
> #endif
> #ifdef ELF_HWCAP4
> - NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
> + NEW_AUX_ENT(AT_HWCAP4, bprm->hwcap4);
> #endif
> NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
> NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
> diff --git a/fs/exec.c b/fs/exec.c
> index 9d5ebc9d15b0..7401efbe4ba0 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1462,6 +1462,17 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl
> */
> bprm->is_check = !!(flags & AT_EXECVE_CHECK);
>
> + bprm->hwcap = ELF_HWCAP;
> +#ifdef ELF_HWCAP2
> + bprm->hwcap2 = ELF_HWCAP2;
> +#endif
> +#ifdef ELF_HWCAP3
> + bprm->hwcap3 = ELF_HWCAP3;
> +#endif
> +#ifdef ELF_HWCAP4
> + bprm->hwcap4 = ELF_HWCAP4;
> +#endif
> +
> retval = bprm_mm_init(bprm);
> if (!retval)
> return bprm;
> @@ -1780,6 +1791,53 @@ static int bprm_execve(struct linux_binprm *bprm)
> return retval;
> }
>
> +static void inherit_hwcap(struct linux_binprm *bprm)
> +{
> + int i, n;
> +
> +#ifdef ELF_HWCAP4
> + n = 4;
> +#elif defined(ELF_HWCAP3)
> + n = 3;
> +#elif defined(ELF_HWCAP2)
> + n = 2;
> +#else
> + n = 1;
> +#endif
> +
> + for (i = 0; n && i < AT_VECTOR_SIZE; i += 2) {
> + long val = current->mm->saved_auxv[i + 1];
> +
> + switch (current->mm->saved_auxv[i]) {
> + case AT_NULL:
> + goto done;
> + case AT_HWCAP:
> + bprm->hwcap = val & ELF_HWCAP;
> + break;
> +#ifdef ELF_HWCAP2
> + case AT_HWCAP2:
> + bprm->hwcap2 = val & ELF_HWCAP2;
> + break;
> +#endif
> +#ifdef ELF_HWCAP3
> + case AT_HWCAP3:
> + bprm->hwcap3 = val & ELF_HWCAP3;
> + break;
> +#endif
> +#ifdef ELF_HWCAP4
> + case AT_HWCAP4:
> + bprm->hwcap4 = val & ELF_HWCAP4;
> + break;
> +#endif
> + default:
> + continue;
> + }
> + n--;
> + }
> +done:
> + mm_flags_set(MMF_USER_HWCAP, bprm->mm);
> +}
> +
> static int do_execveat_common(int fd, struct filename *filename,
> struct user_arg_ptr argv,
> struct user_arg_ptr envp,
> @@ -1856,6 +1914,9 @@ static int do_execveat_common(int fd, struct filename *filename,
> current->comm, bprm->filename);
> }
>
> + if (mm_flags_test(MMF_USER_HWCAP, current->mm))
> + inherit_hwcap(bprm);
> +
> retval = bprm_execve(bprm);
> out_free:
> free_bprm(bprm);
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 65abd5ab8836..94a3dcf9b1d2 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -2,6 +2,7 @@
> #ifndef _LINUX_BINFMTS_H
> #define _LINUX_BINFMTS_H
>
> +#include <linux/elf.h>
> #include <linux/sched.h>
> #include <linux/unistd.h>
> #include <asm/exec.h>
> @@ -67,6 +68,16 @@ struct linux_binprm {
> unsigned long exec;
>
> struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
> + unsigned long hwcap;
> +#ifdef ELF_HWCAP2
> + unsigned long hwcap2;
> +#endif
> +#ifdef ELF_HWCAP3
> + unsigned long hwcap3;
> +#endif
> +#ifdef ELF_HWCAP4
> + unsigned long hwcap4;
> +#endif
>
> char buf[BINPRM_BUF_SIZE];
> } __randomize_layout;
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 78950eb8926d..68c9131dceee 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1871,6 +1871,8 @@ enum {
> #define MMF_TOPDOWN 31 /* mm searches top down by default */
> #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN)
>
> +#define MMF_USER_HWCAP 32 /* user-defined HWCAPs */
> +
> #define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
> MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
> MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b1f3915d5f8e..0091315643de 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1103,6 +1103,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>
> __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
> mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
> +
> + if (mm_flags_test(MMF_USER_HWCAP, current->mm))
> + mm_flags_set(MMF_USER_HWCAP, mm);
> } else {
> __mm_flags_overwrite_word(mm, default_dump_filter);
> mm->def_flags = 0;
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 8d199cf457ae..6fbd7be21a5f 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2157,8 +2157,10 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
> * not introduce additional locks here making the kernel
> * more complex.
> */
> - if (prctl_map.auxv_size)
> + if (prctl_map.auxv_size) {
> memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
> + mm_flags_set(MMF_USER_HWCAP, current->mm);
> + }
>
> mmap_read_unlock(mm);
> return 0;
> @@ -2190,6 +2192,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
>
> task_lock(current);
> memcpy(mm->saved_auxv, user_auxv, len);
> + mm_flags_set(MMF_USER_HWCAP, current->mm);
nit: s/current->mm/mm/
This is not a functional issue, because the function implicitly assumes mm == current->mm.
Maybe we should get rid of the (struct mm_struct *mm) argument here? (This is not
a request to change this patch; I am just mentioning it here.)
LGTM!
> task_unlock(current);
>
> return 0;
> --
> 2.53.0.239.g8d8fc8a987-goog
>
Powered by blists - more mailing lists