lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 6 Oct 2022 16:44:16 -0700
From:   Luis Chamberlain <mcgrof@...nel.org>
To:     Song Liu <song@...nel.org>
Cc:     linux-mm@...ck.org, linux-kernel@...r.kernel.org,
        akpm@...ux-foundation.org, x86@...nel.org, peterz@...radead.org,
        hch@....de, kernel-team@...com, rick.p.edgecombe@...el.com,
        dave.hansen@...el.com
Subject: Re: [RFC 4/5] vmalloc_exec: share a huge page with kernel text

On Thu, Aug 18, 2022 at 03:42:17PM -0700, Song Liu wrote:
> On x86 kernel, we allocate 2MB pages for kernel text up to
> round_down(_etext, 2MB). Therefore, some of the kernel text is still
> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
> round_up(_etext, 2MB), and use the rest of the page for modules and
> BPF programs.
> 
> Here is an example:
> 
> [root@...50-1 ~]# grep _etext /proc/kallsyms
> ffffffff82202a08 T _etext
> 
> [root@...50-1 ~]# grep bpf_prog_ /proc/kallsyms  | tail -n 3
> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup       [bpf]
> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new   [bpf]
> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch       [bpf]
> 
> [root@...50-1 ~]#  grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> 0xffffffff82200000-0xffffffff82400000     2M     ro   PSE         x  pmd
> 
> [root@...50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> ffffffff822ba910 t xfs_flush_inodes_worker      [xfs]
> ffffffff822bc580 t xfs_flush_inodes     [xfs]
> 
> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
> module, and bpf programs.

This is pretty rad. I'm not sure how you were able to squeeze xfs and
*more* into one 2 MiB huge page, though; at least on Debian 5.17.0-1-amd64,
xfs is 3.6847 MiB. How big is your XFS module?

I don't grok mm stuff, but I'd like to understand why we gain the ability
to re-use the same 2 MiB page with this patch; from the code I really
can't tell. Any pointers?

But I'm still concerned about the freeing case in terms of
fragmentation for contiguous memory, when free huge pages are available.

  Luis

> ---
>  arch/x86/mm/init_64.c |  3 ++-
>  mm/vmalloc.c          | 27 +++++++++++++++++++++++++++
>  2 files changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 39c5246964a9..d27d0af5beb5 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>  
>  int kernel_set_to_readonly;
>  
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>  void mark_rodata_ro(void)
>  {
>  	unsigned long start = PFN_ALIGN(_text);
>  	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>  	unsigned long end = (unsigned long)__end_rodata_hpage_align;
> -	unsigned long text_end = PFN_ALIGN(_etext);
> +	unsigned long text_end = PMD_ALIGN(_etext);
>  	unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>  	unsigned long all_end;
>  
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 472287e71bf1..5f3b5df9313f 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>  static const bool vmap_allow_huge = false;
>  #endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>  
> +#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
> +
> +static struct vm_struct text_tail_vm;
> +static struct vmap_area text_tail_va;
> +
>  bool is_vmalloc_addr(const void *x)
>  {
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
>  	unsigned long addr = (unsigned long)kasan_reset_tag(x);
>  	if (addr >= MODULES_VADDR && addr < MODULES_END)
>  		return 1;
> +	if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
> +		return 1;
>  #endif
>  	return is_vmalloc_addr(x);
>  }
> @@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
>  	}
>  }
>  
> +static void register_text_tail_vm(void)
> +{
> +	unsigned long start = PFN_ALIGN(_etext);
> +	unsigned long end = PMD_ALIGN(_etext);
> +	struct vmap_area *va;
> +
> +	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> +	if (WARN_ON_ONCE(!va))
> +		return;
> +	text_tail_vm.addr = (void *)start;
> +	text_tail_vm.size = end - start;
> +	text_tail_vm.flags = VM_KERNEL_EXEC;
> +	text_tail_va.va_start = start;
> +	text_tail_va.va_end = end;
> +	text_tail_va.vm = &text_tail_vm;
> +	memcpy(va, &text_tail_va, sizeof(*va));
> +	insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> +}
> +
>  void __init vmalloc_init(void)
>  {
>  	struct vmap_area *va;
> @@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
>  	 * Create the cache for vmap_area objects.
>  	 */
>  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
> +	register_text_tail_vm();
>  
>  	for_each_possible_cpu(i) {
>  		struct vmap_block_queue *vbq;
> -- 
> 2.30.2
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ