[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <agkliam473nmhxirk76psryxh5qkrncdhwzyoyf4w4efkxnubw@vkeini5qa6xw>
Date: Thu, 17 Oct 2024 12:01:32 -0400
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Jeff Xu <jeffxu@...omium.org>
Cc: akpm@...ux-foundation.org, keescook@...omium.org, jannh@...gle.com,
torvalds@...ux-foundation.org, adhemerval.zanella@...aro.org,
oleg@...hat.com, linux-kernel@...r.kernel.org,
linux-hardening@...r.kernel.org, linux-mm@...ck.org,
jorgelo@...omium.org, sroettger@...gle.com, ojeda@...nel.org,
adobriyan@...il.com, anna-maria@...utronix.de, mark.rutland@....com,
linus.walleij@...aro.org, Jason@...c4.com, deller@....de,
rdunlap@...radead.org, davem@...emloft.net, hch@....de,
peterx@...hat.com, hca@...ux.ibm.com, f.fainelli@...il.com,
gerg@...nel.org, dave.hansen@...ux.intel.com, mingo@...nel.org,
ardb@...nel.org, mhocko@...e.com, 42.hyeyoo@...il.com,
peterz@...radead.org, ardb@...gle.com, enh@...gle.com,
rientjes@...gle.com, groeck@...omium.org, lorenzo.stoakes@...cle.com
Subject: Re: [RFC PATCH v2 1/1] exec: seal system mappings
> On Wed, Oct 16, 2024 at 6:10 PM Liam R. Howlett <Liam.Howlett@...cle.com> wrote:
...
> > > diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
> > > index 2fb7d53cf333..20a3000550d2 100644
> > > --- a/arch/x86/entry/vsyscall/vsyscall_64.c
> > > +++ b/arch/x86/entry/vsyscall/vsyscall_64.c
> > > @@ -32,6 +32,7 @@
> > > #include <linux/mm_types.h>
> > > #include <linux/syscalls.h>
> > > #include <linux/ratelimit.h>
> > > +#include <linux/fs.h>
> > >
> > > #include <asm/vsyscall.h>
> > > #include <asm/unistd.h>
> > > @@ -366,8 +367,12 @@ void __init map_vsyscall(void)
> > > set_vsyscall_pgtable_user_bits(swapper_pg_dir);
> > > }
> > >
> > > - if (vsyscall_mode == XONLY)
> > > - vm_flags_init(&gate_vma, VM_EXEC);
> > > + if (vsyscall_mode == XONLY) {
> > > + unsigned long vm_flags = VM_EXEC;
> > > +
> > > + update_seal_exec_system_mappings(&vm_flags);
> > > + vm_flags_init(&gate_vma, vm_flags);
> > > + }
> > >
> > > BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
> > > (unsigned long)VSYSCALL_ADDR);
> > > diff --git a/fs/exec.c b/fs/exec.c
> > > index 77364806b48d..5030879cda47 100644
> > > --- a/fs/exec.c
> > > +++ b/fs/exec.c
> >
> > Does it make sense for this to live in exec? Couldn't you put it in the
> > mm/mseal.c file? It's vma flags for mappings and you've put it in
> > fs/exec?
> >
> If you are referring to utilities related to kernel cmdline, they
> should be in this file.
You created a wrapper for the command line, but then included the user
in this file as well.
hugetlbfs reads the command line as well, in cmdline_parse_hugetlb_cma.
That parser lives with the rest of the hugetlb code in hugetlb.c
I think this has to do with your view as this is an exec thing, where I
think it's an mm thing. My arguments are that you are directly adding
flags to vmas and it's dealing with mseal which has memory in the name
with the file living in the mm/ directory. If I wanted to know what's
using mseal, I'd start there and totally miss what you are adding here.
Besides applying a vma flag to exec mappings, why do you feel like it
belongs in fs/ ?
>
> > > @@ -68,6 +68,7 @@
> > > #include <linux/user_events.h>
> > > #include <linux/rseq.h>
> > > #include <linux/ksm.h>
> > > +#include <linux/fs_parser.h>
> > >
> > > #include <linux/uaccess.h>
> > > #include <asm/mmu_context.h>
> > > @@ -2159,3 +2160,55 @@ fs_initcall(init_fs_exec_sysctls);
> > > #ifdef CONFIG_EXEC_KUNIT_TEST
> > > #include "tests/exec_kunit.c"
> > > #endif
> > > +
> > > +#ifdef CONFIG_64BIT
> > > +/*
> > > + * Kernel cmdline overwrite for CONFIG_SEAL_SYSTEM_MAPPINGS_X
> > > + */
> > > +enum seal_system_mappings_type {
> > > + SEAL_SYSTEM_MAPPINGS_NEVER,
> > > + SEAL_SYSTEM_MAPPINGS_ALWAYS
> > > +};
> > > +
> > > +static enum seal_system_mappings_type seal_system_mappings __ro_after_init =
> > > + IS_ENABLED(CONFIG_SEAL_SYSTEM_MAPPINGS_ALWAYS) ? SEAL_SYSTEM_MAPPINGS_ALWAYS :
> > > + SEAL_SYSTEM_MAPPINGS_NEVER;
> > > +
> > > +static const struct constant_table value_table_sys_mapping[] __initconst = {
> > > + { "never", SEAL_SYSTEM_MAPPINGS_NEVER},
> > > + { "always", SEAL_SYSTEM_MAPPINGS_ALWAYS},
> > > + { }
> > > +};
> > > +
> > > +static int __init early_seal_system_mappings_override(char *buf)
> > > +{
> > > + if (!buf)
> > > + return -EINVAL;
> > > +
> > > + seal_system_mappings = lookup_constant(value_table_sys_mapping,
> > > + buf, seal_system_mappings);
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +early_param("exec.seal_system_mappings", early_seal_system_mappings_override);
> > > +
> > > +static bool seal_system_mappings_enabled(void)
> > > +{
> > > + if (seal_system_mappings == SEAL_SYSTEM_MAPPINGS_ALWAYS)
> > > + return true;
> > > +
> > > + return false;
> > > +}
> >
> > This function seems unnecessary, it is called from another 3-4 line
> > function only.
> >
> It is more readable this way.
It really isn't - you don't have to cram everything into one if
statement.
>
> > > +
> > > +void update_seal_exec_system_mappings(unsigned long *vm_flags)
> > > +{
> > > + if (!(*vm_flags & VM_SEALED) && seal_system_mappings_enabled())
> >
> > Why !(*vm_flags & VM_SEALED) here?
> >
> If vm_flags is already sealed, then there is no need to check
> seal_system_mappings_enabled.
>
> > > + *vm_flags |= VM_SEALED;
> > > +
> > > +}
> >
> > Instead of passing a pointer around and checking enabled, why don't you
> > have a function that just returns the VM_SEALED or 0 and just or it into
> > the flags? This seems very heavy for what it does, why did you do it
> > this way?
> >
> Why is that heavy ? passing a pointer for updating variables is natural.
What you have here will pass a pointer to a function, dereference the
pointer and potentially call another function to check seal system
mappings is enabled to dereference a pointer to or a bit. I would hope
the compiler improves this, but that depends on the compiler.
What you *could* do is call a function to check the seal system mappings
is enabled and return the bit to set. The caller would just need to or
the bit on value without dereferencing the pointer. We may be able to
even inline the function you call.
So I am suggesting removing two dereferences and a function call (maybe
two?) while making the code more compact and readable. This is what I
mean by heavy.
>
> > The name is also very long and a bit odd, it could be used for other
> > reasons, but you have _system_mappings on the end, and you use seal but
> > it's mseal (or vm_seal)? Would mseal_flag() work?
> >
> It could be longer :-)
> it means update_sealing_flag_for_executable_system_mappings.
> mseal_flag is too short and not descriptive.
mseal_exec_flags() ?
>
> > > +#else
> > > +void update_seal_exec_system_mappings(unsigned long *vm_flags)
> > > +{
> > > +}
> > > +#endif /* CONFIG_64BIT */
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index 42444ec95c9b..6e44aca4b24b 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> >
> > Again, I don't understand why fs.h is the place for mseal definitions?
> >
> include/linux/fs.h contains other exec related function signatures. So
> it is better to keep them at the same header.
I don't see this as an exec related function, it's changing vma flags.
Besides this new function, the fs/exec.c deals with the stack flags.
Well, this header has VM_MAY{READ/WRITE/EXEC} for nommu - but I don't
think we can really count those here and they are used to translate
things anyways.
Recently, we have been trying to modularize the vma code for better
testing. Spreading the code across many files will make that testing
harder to do in isolation.
>
> > > @@ -3079,6 +3079,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
> > > extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
> > > extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
> > > extern struct file * open_exec(const char *);
> > > +extern void update_seal_exec_system_mappings(unsigned long *vm_flags);
> >
> > We are dropping extern where possible now.
> >
> extern can be dropped, it appears not causing link error.
> > >
> > > /* fs/dcache.c -- generic fs support functions */
> > > extern bool is_subdir(struct dentry *, struct dentry *);
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index c47a0bf25e58..e9876fae8887 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -1506,7 +1506,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
> > > }
> > >
> > > vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
> > > - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
> > > + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO|VM_SEALED,
> > > &xol_mapping);
> >
> > Changing all uprobes seems like something that should probably be
> > mentioned more than just the note at the end of the change log, even if
> > you think it won't have any impact. The note is even hidden at the end
> > of a paragraph.
> >
> > I would go as far as splitting this patch out as its own so that the
> > subject line specifies that all uprobes will be VM_SEALED now.
> >
> > Maybe it's fine but maybe it isn't and you've buried it so that it will
> > be missed by virtually everyone.
> >
> I will add "It is sealed from creation." in the above "uprobe" section.
That in inadequate for what you are changing. Doing it this way risks
breaking things because no one will notice.
>
> >
> > > if (IS_ERR(vma)) {
> > > ret = PTR_ERR(vma);
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 57fd5ab2abe7..d4717e34a60d 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -2133,6 +2133,7 @@ struct vm_area_struct *_install_special_mapping(
> > > unsigned long addr, unsigned long len,
> > > unsigned long vm_flags, const struct vm_special_mapping *spec)
> > > {
> > > + update_seal_exec_system_mappings(&vm_flags);
> > > return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
> > > &special_mapping_vmops);
> >
> > If you were to return a flag, you could change the vm_flags argument to
> > vm_flags | mseal_flag()
> >
> passing pointer seems to be the most efficient way.
I disagree. Here is the godbolt.org output for gcc x86-64 14.2 of your
code (with some added #defines to make it compile)
seal_system_mappings:
.long 1
seal_system_mappings_enabled:
push rbp
mov rbp, rsp
mov eax, DWORD PTR seal_system_mappings[rip]
cmp eax, 1
jne .L2
mov eax, 1
jmp .L3
.L2:
mov eax, 0
.L3:
pop rbp
ret
update_seal_exec_system_mappings:
push rbp
mov rbp, rsp
sub rsp, 8
mov QWORD PTR [rbp-8], rdi
mov rax, QWORD PTR [rbp-8]
mov rax, QWORD PTR [rax]
and eax, 2
test rax, rax
jne .L6
call seal_system_mappings_enabled
test al, al
je .L6
mov rax, QWORD PTR [rbp-8]
mov rax, QWORD PTR [rax]
or rax, 2
mov rdx, rax
mov rax, QWORD PTR [rbp-8]
mov QWORD PTR [rax], rdx
.L6:
nop
leave
ret
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov QWORD PTR [rbp-8], 0
lea rax, [rbp-8]
mov rdi, rax
call update_seal_exec_system_mappings
mov rax, QWORD PTR [rbp-8]
leave
ret
----- 48 lines -----
Here is what I am suggesting to do with replacing the passing of a
pointer with a concise "vm_flags | mseal_exec_flags()" (with the same
added #defines to make it compile)
seal_system_mappings:
.long 1
mseal_exec_flags:
push rbp
mov rbp, rsp
mov eax, DWORD PTR seal_system_mappings[rip]
cmp eax, 1
jne .L2
mov eax, 2
jmp .L3
.L2:
mov eax, 0
.L3:
pop rbp
ret
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov QWORD PTR [rbp-8], 0
call mseal_exec_flags
mov edx, eax
mov rax, QWORD PTR [rbp-8]
or eax, edx
leave
ret
----- 26 lines -----
So as you can see, there are less instructions in my version; there are
47.92% less lines of assembly.
Your vsyscall version is even worse.
It is very tiresome to do code inspections and be told flat-out "no"
repetitively.
Liam
Powered by blists - more mailing lists