Date:	Mon, 23 Feb 2009 20:13:42 +1100
From:	Nick Piggin <nickpiggin@...oo.com.au>
To:	Jeremy Fitzhardinge <jeremy@...p.org>
Cc:	Andrew Morton <akpm@...ux-foundation.org>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Linux Memory Management List <linux-mm@...ck.org>,
	"the arch/x86 maintainers" <x86@...nel.org>,
	Arjan van de Ven <arjan@...ux.intel.com>
Subject: Re: [PATCH RFC] vm_unmap_aliases: allow callers to inhibit TLB flush

On Monday 23 February 2009 18:30:14 Jeremy Fitzhardinge wrote:
> Nick Piggin wrote:
> > On Friday 20 February 2009 06:11:32 Jeremy Fitzhardinge wrote:
> >> Nick Piggin wrote:
> >>> Then what is the point of vm_unmap_aliases? If you are doing it
> >>> for security it won't work because other CPUs might still be able
> >>> to write through dangling TLBs. If you are not doing it for
> >>> security then it does not need to be done at all.
> >>
> >> Xen will make sure any dangling tlb entries are flushed before handing
> >> the page out to anyone else.
> >>
> >>> Unless it is something strange that Xen does with the page table
> >>> structure and you just need to get rid of those?
> >>
> >> Yeah.  A pte pointing at a page holds a reference on it, saying that it
> >> belongs to the domain.  You can't return it to Xen until the refcount is
> >> 0.
> >
> > OK. Then I will remember to find some time to get the interrupt
> > safe patches working. I wonder why you can't just return it to
> > Xen when (or have Xen hold it somewhere until) the refcount
> > reaches 0?
>
> It would still need to allocate a page in the meantime, which could fail
> because the domain has hit its hard memory limit (which will be the
> common case, because a domain generally starts with its full complement
> of memory). The nice thing about the exchange is that there's no
> accounting to deal with.

OK, well I don't really understand the details but I trust you if
you say it's hard :)


> >>> Or... what if we just allow a compile and/or boot time flag to indicate
> >>> that the arch does not want lazy vmap unmapping, and just revert to
> >>> synchronous unmapping? If Xen needs lots of flushing it might
> >>> not be a win anyway.
> >>
> >> That may be worth considering.
> >
> > ... in the meantime, shall we just do this for Xen? It is probably
> > safer and may end up with no worse performance on Xen anyway. If
> > we get more vmap users and it becomes important, you could look at
> > more sophisticated ways of doing this. E.g. a page could be flagged
> > if it potentially has lazy vmaps.
>
> OK.  Do you want to do the patch, or shall I?

Here's a start for you. I think it gets rid of all the dead code and
data without introducing any actual conditional compilation...

---
 mm/vmalloc.c |   66 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 18 deletions(-)

Index: linux-2.6/mm/vmalloc.c
===================================================================
--- linux-2.6.orig/mm/vmalloc.c
+++ linux-2.6/mm/vmalloc.c
@@ -29,6 +29,11 @@
 #include <asm/uaccess.h>
 #include <asm/tlbflush.h>
 
+#ifdef CONFIG_VMAP_NO_LAZY_FLUSH
+#define VMAP_LAZY_FLUSHES 0
+#else
+#define VMAP_LAZY_FLUSHES 1
+#endif
 
 /*** Page table manipulation functions ***/
 
@@ -376,7 +381,7 @@ retry:
 found:
 	if (addr + size > vend) {
 		spin_unlock(&vmap_area_lock);
-		if (!purged) {
+		if (VMAP_LAZY_FLUSHES && !purged) {
 			purge_vmap_area_lazy();
 			purged = 1;
 			goto retry;
@@ -413,7 +418,10 @@ static void __free_vmap_area(struct vmap
 	RB_CLEAR_NODE(&va->rb_node);
 	list_del_rcu(&va->list);
 
-	call_rcu(&va->rcu_head, rcu_free_va);
+	if (VMAP_LAZY_FLUSHES)
+		call_rcu(&va->rcu_head, rcu_free_va);
+	else
+		kfree(va);
 }
 
 /*
@@ -450,8 +458,10 @@ static void vmap_debug_free_range(unsign
 	 * faster).
 	 */
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	vunmap_page_range(start, end);
-	flush_tlb_kernel_range(start, end);
+	if (VMAP_LAZY_FLUSHES) {
+		vunmap_page_range(start, end);
+		flush_tlb_kernel_range(start, end);
+	}
 #endif
 }
 
@@ -571,10 +581,16 @@ static void purge_vmap_area_lazy(void)
  */
 static void free_unmap_vmap_area_noflush(struct vmap_area *va)
 {
-	va->flags |= VM_LAZY_FREE;
-	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
-	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
-		try_purge_vmap_area_lazy();
+	if (VMAP_LAZY_FLUSHES) {
+		va->flags |= VM_LAZY_FREE;
+		atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT,
+							&vmap_lazy_nr);
+		if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+			try_purge_vmap_area_lazy();
+	} else {
+		vunmap_page_range(va->va_start, va->va_end);
+		flush_tlb_kernel_range(va->va_start, va->va_end);
+	}
 }
 
 /*
@@ -610,6 +626,15 @@ static void free_unmap_vmap_area_addr(un
 /*** Per cpu kva allocator ***/
 
 /*
+ * This does lazy flushing as well, so don't call it if the arch doesn't want
+ * lazy vmap kva flushes... The scalability aspect should be less important
+ * in that case anyway seeing as kernel tlb flushing tends not to be scalable.
+ * It would be possible to make this work without lazy tlb flushing if it
+ * was really a big deal.
+ */
+
+
+/*
  * vmap space is limited especially on 32 bit architectures. Ensure there is
  * room for at least 16 percpu vmap blocks per CPU.
  */
@@ -877,6 +902,9 @@ void vm_unmap_aliases(void)
 	int cpu;
 	int flush = 0;
 
+	if (!VMAP_LAZY_FLUSHES)
+		return;
+
 	if (unlikely(!vmap_initialized))
 		return;
 
@@ -937,7 +965,7 @@ void vm_unmap_ram(const void *mem, unsig
 	debug_check_no_locks_freed(mem, size);
 	vmap_debug_free_range(addr, addr+size);
 
-	if (likely(count <= VMAP_MAX_ALLOC))
+	if (VMAP_LAZY_FLUSHES && likely(count <= VMAP_MAX_ALLOC))
 		vb_free(mem, size);
 	else
 		free_unmap_vmap_area_addr(addr);
@@ -959,7 +987,7 @@ void *vm_map_ram(struct page **pages, un
 	unsigned long addr;
 	void *mem;
 
-	if (likely(count <= VMAP_MAX_ALLOC)) {
+	if (VMAP_LAZY_FLUSHES && likely(count <= VMAP_MAX_ALLOC)) {
 		mem = vb_alloc(size, GFP_KERNEL);
 		if (IS_ERR(mem))
 			return NULL;
@@ -988,14 +1016,16 @@ void __init vmalloc_init(void)
 	struct vm_struct *tmp;
 	int i;
 
-	for_each_possible_cpu(i) {
-		struct vmap_block_queue *vbq;
-
-		vbq = &per_cpu(vmap_block_queue, i);
-		spin_lock_init(&vbq->lock);
-		INIT_LIST_HEAD(&vbq->free);
-		INIT_LIST_HEAD(&vbq->dirty);
-		vbq->nr_dirty = 0;
+	if (VMAP_LAZY_FLUSHES) {
+		for_each_possible_cpu(i) {
+			struct vmap_block_queue *vbq;
+
+			vbq = &per_cpu(vmap_block_queue, i);
+			spin_lock_init(&vbq->lock);
+			INIT_LIST_HEAD(&vbq->free);
+			INIT_LIST_HEAD(&vbq->dirty);
+			vbq->nr_dirty = 0;
+		}
 	}
 
 	/* Import existing vmlist entries. */
.
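
One thing the patch does not include is the Kconfig side: it tests
CONFIG_VMAP_NO_LAZY_FLUSH but never defines the symbol. As a minimal sketch
(hypothetical placement, not part of the patch above), it could be a hidden
bool, say in mm/Kconfig:

	config VMAP_NO_LAZY_FLUSH
		bool

with whichever arch or subarch wants synchronous unmapping doing a
"select VMAP_NO_LAZY_FLUSH" from its own Kconfig entry. Since
VMAP_LAZY_FLUSHES is then a compile-time constant, the compiler can discard
the lazy-purge paths as dead code, which is the point of doing it this way
rather than sprinkling #ifdefs through vmalloc.c.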
