linux-kernel - Re: [patch 00/10] [RFC] SLUB patches for more functionality, performance and maintenance

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Pine.LNX.4.64.0707091715450.2062@schroedinger.engr.sgi.com>
Date:	Mon, 9 Jul 2007 17:55:30 -0700 (PDT)
From:	Christoph Lameter <clameter@....com>
To:	Mathieu Desnoyers <mathieu.desnoyers@...ymtl.ca>
cc:	Martin Bligh <mbligh@...igh.org>, Andi Kleen <andi@...stfloor.org>,
	linux-kernel@...r.kernel.org, linux-mm@...ck.org,
	David Miller <davem@...emloft.net>
Subject: Re: [patch 00/10] [RFC] SLUB patches for more functionality,
 performance and maintenance

OK, here is a replacement patch for the cmpxchg patch. Problems:

1. cmpxchg_local is not available on all arches. If we wanted to do
   this then it needs to be universally available.

2. cmpxchg_local does generate the "lock" prefix. It should not do that.
   Without fixes to cmpxchg_local we cannot expect maximum performance.

3. The approach is x86 centric. It relies on a cmpxchg that does not
   synchronize with memory used by other cpus and therefore is more
   lightweight. As far as I know the IA64 cmpxchg cannot do that.
   Neither can several other processors. I am not sure how cmpxchgless
   platforms would use that. We need a detailed comparison of
   interrupt enable / disable vs. cmpxchg cycle counts for cachelines in
   the cpu cache to evaluate the impact that such a change would have.

   The cmpxchg (or its emulation) does not need any barriers since the
   accesses can only come from a single processor. 

Mathieu measured a significant performance benefit coming from not using
interrupt enable / disable.

Some rough processor cycle counts (anyone have better numbers?)

	STI	CLI	CMPXCHG
IA32	36	26	1 (assume XCHG == CMPXCHG, sti/cli also need stack pushes/pops)
IA64	12	12	1 (but ar.ccv needs 11 cycles to set comparator,
			need register moves to preserve processors flags)

Looks like STI/CLI is pretty expensive and it seems that we may be able to
optimize the alloc / free hotpath quite a bit if we could drop the 
interrupt enable / disable. But we need some measurements.


Draft of a new patch:

SLUB: Single atomic instruction alloc/free using cmpxchg_local

A cmpxchg allows us to avoid disabling and enabling interrupts. The cmpxchg
is optimal for operations on the per cpu freelist. We can stay on one
processor by disabling preemption while still allowing concurrent
interrupts, thus avoiding the overhead of disabling and enabling interrupts.

Pro:
	- No need to disable interrupts.
	- Preempt disable / enable vanishes on non-preempt kernels
Con:
        - Slightly more complex handling.
	- Updates to atomic instructions needed

Signed-off-by: Christoph Lameter <clameter@....com>

---
 mm/slub.c |   72 ++++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 23 deletions(-)

Index: linux-2.6.22-rc6-mm1/mm/slub.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/mm/slub.c	2007-07-09 15:04:46.000000000 -0700
+++ linux-2.6.22-rc6-mm1/mm/slub.c	2007-07-09 17:09:00.000000000 -0700
@@ -1467,12 +1467,14 @@ static void *__slab_alloc(struct kmem_ca
 {
 	void **object;
 	struct page *new;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	if (!c->page)
 		goto new_slab;
 
 	slab_lock(c->page);
-	if (unlikely(!node_match(c, node)))
+	if (unlikely(!node_match(c, node) || c->freelist))
 		goto another_slab;
 load_freelist:
 	object = c->page->freelist;
@@ -1486,7 +1488,14 @@ load_freelist:
 	c->page->inuse = s->objects;
 	c->page->freelist = NULL;
 	c->node = page_to_nid(c->page);
+out:
 	slab_unlock(c->page);
+	local_irq_restore(flags);
+	preempt_enable();
+
+	if (unlikely((gfpflags & __GFP_ZERO)))
+		memset(object, 0, c->objsize);
+
 	return object;
 
 another_slab:
@@ -1527,6 +1536,8 @@ new_slab:
 		c->page = new;
 		goto load_freelist;
 	}
+	local_irq_restore(flags);
+	preempt_enable();
 	return NULL;
 debug:
 	c->freelist = NULL;
@@ -1536,8 +1547,7 @@ debug:
 
 	c->page->inuse++;
 	c->page->freelist = object[c->offset];
-	slab_unlock(c->page);
-	return object;
+	goto out;
 }
 
 /*
@@ -1554,23 +1564,20 @@ static void __always_inline *slab_alloc(
 		gfp_t gfpflags, int node, void *addr)
 {
 	void **object;
-	unsigned long flags;
 	struct kmem_cache_cpu *c;
 
-	local_irq_save(flags);
+	preempt_disable();
 	c = get_cpu_slab(s, smp_processor_id());
-	if (unlikely(!c->page || !c->freelist ||
-					!node_match(c, node)))
+redo:
+	object = c->freelist;
+	if (unlikely(!object || !node_match(c, node)))
+		return __slab_alloc(s, gfpflags, node, addr, c);
 
-		object = __slab_alloc(s, gfpflags, node, addr, c);
+	if (cmpxchg_local(&c->freelist, object, object[c->offset]) != object)
+		goto redo;
 
-	else {
-		object = c->freelist;
-		c->freelist = object[c->offset];
-	}
-	local_irq_restore(flags);
-
-	if (unlikely((gfpflags & __GFP_ZERO) && object))
+	preempt_enable();
+	if (unlikely((gfpflags & __GFP_ZERO)))
 		memset(object, 0, c->objsize);
 
 	return object;
@@ -1603,7 +1610,9 @@ static void __slab_free(struct kmem_cach
 {
 	void *prior;
 	void **object = (void *)x;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	slab_lock(page);
 
 	if (unlikely(SlabDebug(page)))
@@ -1629,6 +1638,8 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+	local_irq_restore(flags);
+	preempt_enable();
 	return;
 
 slab_empty:
@@ -1639,6 +1650,8 @@ slab_empty:
 		remove_partial(s, page);
 
 	slab_unlock(page);
+	local_irq_restore(flags);
+	preempt_enable();
 	discard_slab(s, page);
 	return;
 
@@ -1663,18 +1676,31 @@ static void __always_inline slab_free(st
 			struct page *page, void *x, void *addr)
 {
 	void **object = (void *)x;
-	unsigned long flags;
 	struct kmem_cache_cpu *c;
+	void **freelist;
 
-	local_irq_save(flags);
+	preempt_disable();
 	c = get_cpu_slab(s, smp_processor_id());
-	if (likely(page == c->page && c->freelist)) {
-		object[c->offset] = c->freelist;
-		c->freelist = object;
-	} else
-		__slab_free(s, page, x, addr, c->offset);
+redo:
+	freelist = c->freelist;
+	/*
+	 * Must read freelist before c->page. If an interrupt occurs and
+	 * changes c->page after we have read it here then it
+	 * will also have changed c->freelist and the cmpxchg will fail.
+	 *
+	 * If we had checked c->page first then the freelist could
+	 * have been changed under us before we read c->freelist and we
+	 * would not be able to detect that situation.
+	 */
+	smp_rmb();
+	if (unlikely(page != c->page || !freelist))
+		return __slab_free(s, page, x, addr, c->offset);
+
+	object[c->offset] = freelist;
+	if (cmpxchg_local(&c->freelist, freelist, object) != freelist)
+		goto redo;
 
-	local_irq_restore(flags);
+	preempt_enable();
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/