Message-ID: <Yz9hhTRzMr0ZEnA/@bombadil.infradead.org>
Date:   Thu, 6 Oct 2022 16:15:17 -0700
From:   Luis Chamberlain <mcgrof@...nel.org>
To:     Song Liu <song@...nel.org>, Vlastimil Babka <vbabka@...e.cz>,
        Mel Gorman <mgorman@...e.de>
Cc:     linux-mm@...ck.org, linux-kernel@...r.kernel.org,
        akpm@...ux-foundation.org, x86@...nel.org, peterz@...radead.org,
        hch@....de, kernel-team@...com, rick.p.edgecombe@...el.com,
        dave.hansen@...el.com
Subject: Re: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec

On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>  }
>  EXPORT_SYMBOL(vm_map_pages_zero);
>  
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	return NULL;
> +}

Well that's not so nice for no-mmu systems. Shouldn't we have a
fallback?
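
Even a trivial fallback seems better than an unconditional NULL. Something
along these lines, completely untested, and it ignores the align argument,
which may or may not matter to callers:

void *vmalloc_exec(unsigned long size, unsigned long align)
{
	/*
	 * With no MMU there are no page permissions to flip, so plain
	 * vmalloc() memory should already be usable as text. Whether
	 * silently ignoring @align is acceptable is an open question.
	 */
	return vmalloc(size);
}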

> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index effd1ff6a4b4..472287e71bf1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	va->va_end = addr + size;
>  	va->vm = NULL;
>  
> -	spin_lock(&vmap_area_lock);
> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> -	spin_unlock(&vmap_area_lock);
> +	if (vm_flags & VM_KERNEL_EXEC) {
> +		spin_lock(&free_text_area_lock);
> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> +		/* update subtree_max_size now as we need this soon */
> +		augment_tree_propagate_from(va);

Sorry, it is not clear to me why this is needed only for the exec case, can
you elaborate a bit more?

> +		spin_unlock(&free_text_area_lock);
> +	} else {
> +		spin_lock(&vmap_area_lock);
> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +		spin_unlock(&vmap_area_lock);
> +	}
>  
>  	BUG_ON(!IS_ALIGNED(va->va_start, align));
>  	BUG_ON(va->va_start < vstart);

<-- snip -->

> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
>  }
>  EXPORT_SYMBOL(vmalloc);
>  
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	struct vmap_area *va, *tmp;
> +	unsigned long addr;
> +	enum fit_type type;
> +	int ret;
> +
> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
> +	if (unlikely(!va))
> +		return ERR_PTR(-ENOMEM);
> +
> +again:
> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
> +				     size, align, 1, false);
> +
> +	if (!tmp) {
> +		unsigned long alloc_size;
> +		void *ptr;
> +
> +		spin_unlock(&free_text_area_lock);
> +
> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
> +					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
> +					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
> +					   NUMA_NO_NODE, __builtin_return_address(0));

We can review the guard stuff on the other thread with Peter.

> +		if (unlikely(!ptr)) {
> +			ret = -ENOMEM;
> +			goto err_out;
> +		}
> +		memset(ptr, 0, alloc_size);
> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);

I *really* like that this is no longer something users have to muck with, thanks!

> +
> +		goto again;
> +	}
> +
> +	addr = roundup(tmp->va_start, align);
> +	type = classify_va_fit_type(tmp, addr, size);
> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
> +		addr = -ENOMEM;
> +		goto err_out;
> +	}
> +
> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
> +				    tmp, addr, size, type);
> +	if (ret) {
> +		addr = ret;
> +		goto err_out;
> +	}
> +	spin_unlock(&free_text_area_lock);
> +
> +	va->va_start = addr;
> +	va->va_end = addr + size;
> +	va->vm = tmp->vm;
> +
> +	spin_lock(&vmap_area_lock);
> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +	spin_unlock(&vmap_area_lock);
> +
> +	return (void *)addr;
> +
> +err_out:
> +	spin_unlock(&free_text_area_lock);
> +	return ERR_PTR(ret);
> +}
> +
> +void vfree_exec(const void *addr)
> +{
> +	struct vmap_area *va;
> +
> +	might_sleep();
> +
> +	spin_lock(&vmap_area_lock);
> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
> +	if (WARN_ON_ONCE(!va)) {
> +		spin_unlock(&vmap_area_lock);
> +		return;
> +	}
> +
> +	unlink_va(va, &vmap_area_root);

Curious why we don't memset to 0 before merge_or_add_vmap_area_augment()?
I realize other code doesn't seem to do it, though.
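
Roughly what I had in mind, completely untested, glossing over the fact that
the range may not be page aligned here, and the range is still RO+X at this
point so it would have to be flipped writable first (or scrubbed through the
text_poke() machinery instead):

	/* scrub the old text before it goes back on the free list */
	unsigned long nr_pages = (va->va_end - va->va_start) >> PAGE_SHIFT;

	set_memory_rw(va->va_start, nr_pages);
	memset((void *)va->va_start, 0, va->va_end - va->va_start);
	set_memory_ro(va->va_start, nr_pages);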

> +	spin_unlock(&vmap_area_lock);
> +
> +	spin_lock(&free_text_area_lock);
> +	merge_or_add_vmap_area_augment(va,
> +		&free_text_area_root, &free_text_area_list);

My concern is that we grab precious physically contiguous memory from huge
pages only to carve out our own pool and let it become non-contiguous
afterwards.

I'm starting to suspect that if the allocation is > PAGE_SIZE we should just
give it back to the system. Otherwise wouldn't the fragmentation cause us to
eventually eat up most of the huge pages available? Probably not for eBPF,
but if we use this on a system with tons of module insertions / deletions
this seems like it could happen?
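
In vfree_exec() terms I'm picturing something along these lines, hand-wavy
and not even compile tested; give_back_to_vmalloc() is a made-up placeholder
for whatever unmapping / freeing of the backing vm_struct would really be
needed:

	if (va->va_end - va->va_start > PAGE_SIZE) {
		/*
		 * Large chunk: hand it back to the system rather than
		 * caching it in the free_text pool, so huge-page backed
		 * memory doesn't sit there fragmented forever.
		 */
		give_back_to_vmalloc(va);	/* made-up placeholder */
		return;
	}
	/* otherwise fall through to the existing free_text caching below */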

  Luis

> +	spin_unlock(&free_text_area_lock);
> +	/* TODO: when the whole vm_struct is not in use, free it */
> +}
> +
>  /**
>   * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>   * @size:      allocation size
> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  			/* It is a BUG(), but trigger recovery instead. */
>  			goto recovery;
>  
> -		ret = adjust_va_to_fit_type(va, start, size, type);
> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
> +					    va, start, size, type);
>  		if (unlikely(ret))
>  			goto recovery;
>  
> -- 
> 2.30.2
> 
