This is a bit of a different tack on things than the last version provided
by Mathieu. Instead of using a cmpxchg, we keep a state variable in the
per-cpu structure that is incremented when we enter the hot path. We can
then detect that a thread is already in the fastpath and fall back to an
alternate allocation/free technique that bypasses fastpath caching. A
disadvantage is that we have to disable preemption. But if preemption is
disabled (as it is on most kernels that I run), then the hotpath becomes
very efficient. A simplified sketch of the counter scheme follows after
the patch.

Cc: Mathieu Desnoyers
Cc: Pekka Enberg
Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    1 
 mm/slub.c                |   91 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 74 insertions(+), 18 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2009-10-01 15:53:15.000000000 -0500
+++ linux-2.6/include/linux/slub_def.h	2009-10-01 15:53:15.000000000 -0500
@@ -38,6 +38,7 @@ struct kmem_cache_cpu {
 	void **freelist;	/* Pointer to first free per cpu object */
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
+	int active;		/* Active fastpaths */
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2009-10-01 15:53:15.000000000 -0500
+++ linux-2.6/mm/slub.c	2009-10-01 15:53:15.000000000 -0500
@@ -1606,7 +1606,14 @@ static void *__slab_alloc(struct kmem_ca
 		unsigned long addr)
 {
 	void **object;
-	struct page *page = __this_cpu_read(s->cpu_slab->page);
+	struct page *page;
+	unsigned long flags;
+	int hotpath;
+
+	local_irq_save(flags);
+	preempt_enable();	/* Get rid of count */
+	hotpath = __this_cpu_read(s->cpu_slab->active) != 0;
+	page = __this_cpu_read(s->cpu_slab->page);
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
@@ -1626,13 +1633,21 @@ load_freelist:
 		goto another_slab;
 	if (unlikely(SLABDEBUG && PageSlubDebug(page)))
 		goto debug;
-
-	__this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object));
-	page->inuse = page->objects;
-	page->freelist = NULL;
-	__this_cpu_write(s->cpu_slab->node, page_to_nid(page));
+	if (unlikely(hotpath)) {
+		/* Object on second free list available and hotpath busy */
+		page->inuse++;
+		page->freelist = get_freepointer(s, object);
+	} else {
+		/* Prepare new list of objects for hotpath */
+		__this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object));
+		page->inuse = page->objects;
+		page->freelist = NULL;
+		__this_cpu_write(s->cpu_slab->node, page_to_nid(page));
+	}
 unlock_out:
+	__this_cpu_dec(s->cpu_slab->active);
 	slab_unlock(page);
+	local_irq_restore(flags);
 	stat(s, ALLOC_SLOWPATH);
 	return object;
 
@@ -1642,8 +1657,12 @@ another_slab:
 new_slab:
 	page = get_partial(s, gfpflags, node);
 	if (page) {
-		__this_cpu_write(s->cpu_slab->page, page);
 		stat(s, ALLOC_FROM_PARTIAL);
+
+		if (hotpath)
+			goto hot_lock;
+
+		__this_cpu_write(s->cpu_slab->page, page);
 		goto load_freelist;
 	}
 
@@ -1657,6 +1676,10 @@ new_slab:
 
 	if (page) {
 		stat(s, ALLOC_SLAB);
+
+		if (hotpath)
+			goto hot_no_lock;
+
 		if (__this_cpu_read(s->cpu_slab->page))
 			flush_slab(s, __this_cpu_ptr(s->cpu_slab));
 		slab_lock(page);
@@ -1664,6 +1687,10 @@ new_slab:
 		__this_cpu_write(s->cpu_slab->page, page);
 		goto load_freelist;
 	}
+
+	__this_cpu_dec(s->cpu_slab->active);
+	local_irq_restore(flags);
+
 	if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
 		slab_out_of_memory(s, gfpflags, node);
 	return NULL;
@@ -1675,6 +1702,19 @@ debug:
 	page->freelist = get_freepointer(s, object);
 	__this_cpu_write(s->cpu_slab->node, -1);
 	goto unlock_out;
+
+	/*
+	 * Hotpath is busy and we need to avoid touching
+	 * hotpath variables
+	 */
+hot_no_lock:
+	slab_lock(page);
+hot_lock:
+	__ClearPageSlubFrozen(page);
+	if (get_freepointer(s, page->freelist))
+		/* Cannot put page into the hotpath. Instead back to partial */
+		add_partial(get_node(s, page_to_nid(page)), page, 0);
+	goto load_freelist;
 }
 
 /*
@@ -1691,7 +1731,6 @@ static __always_inline void *slab_alloc(
 		gfp_t gfpflags, int node, unsigned long addr)
 {
 	void **object;
-	unsigned long flags;
 
 	gfpflags &= gfp_allowed_mask;
 
@@ -1701,19 +1740,21 @@ static __always_inline void *slab_alloc(
 	if (should_failslab(s->objsize, gfpflags))
 		return NULL;
 
-	local_irq_save(flags);
+	preempt_disable();
+	irqsafe_cpu_inc(s->cpu_slab->active);
 	object = __this_cpu_read(s->cpu_slab->freelist);
-	if (unlikely(!object || !node_match(s, node)))
+	if (unlikely(!object || !node_match(s, node) ||
+			__this_cpu_read(s->cpu_slab->active)))
 
 		object = __slab_alloc(s, gfpflags, node, addr);
 
 	else {
 		__this_cpu_write(s->cpu_slab->freelist,
			get_freepointer(s, object));
+		irqsafe_cpu_dec(s->cpu_slab->active);
+		preempt_enable();
 		stat(s, ALLOC_FASTPATH);
 	}
-	local_irq_restore(flags);
-
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
 		memset(object, 0, s->objsize);
 
@@ -1777,6 +1818,11 @@ static void __slab_free(struct kmem_cach
 {
 	void *prior;
 	void **object = (void *)x;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	preempt_enable();	/* Fix up count */
+	__this_cpu_dec(s->cpu_slab->active);
 
 	stat(s, FREE_SLOWPATH);
 	slab_lock(page);
@@ -1809,6 +1855,7 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+	local_irq_restore(flags);
 	return;
 
 slab_empty:
@@ -1820,6 +1867,7 @@ slab_empty:
 		stat(s, FREE_REMOVE_PARTIAL);
 	}
 	slab_unlock(page);
+	local_irq_restore(flags);
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 	return;
@@ -1845,24 +1893,26 @@ static __always_inline void slab_free(st
 			struct page *page, void *x, unsigned long addr)
 {
 	void **object = (void *)x;
-	unsigned long flags;
 
 	kmemleak_free_recursive(x, s->flags);
-	local_irq_save(flags);
 	kmemcheck_slab_free(s, object, s->objsize);
 	debug_check_no_locks_freed(object, s->objsize);
 	if (!(s->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(object, s->objsize);
 
+	preempt_disable();
+	irqsafe_cpu_inc(s->cpu_slab->active);
 	if (likely(page == __this_cpu_read(s->cpu_slab->page) &&
-			__this_cpu_read(s->cpu_slab->node) >= 0)) {
-		set_freepointer(s, object, __this_cpu_read(s->cpu_slab->freelist));
+			__this_cpu_read(s->cpu_slab->node) >= 0) &&
+			!__this_cpu_read(s->cpu_slab->active)) {
+		set_freepointer(s, object,
+			__this_cpu_read(s->cpu_slab->freelist));
 		__this_cpu_write(s->cpu_slab->freelist, object);
+		irqsafe_cpu_dec(s->cpu_slab->active);
+		preempt_enable();
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
-
-	local_irq_restore(flags);
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2064,6 +2114,8 @@ static DEFINE_PER_CPU(struct kmem_cache_
 
 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
 {
+	int cpu;
+
 	if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
 		/*
 		 * Boot time creation of the kmalloc array. Use static per cpu data
@@ -2073,6 +2125,9 @@ static inline int alloc_kmem_cache_cpus(
 	else
 		s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
 
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->active = -1;
+
 	if (!s->cpu_slab)
 		return 0;
 
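
Not part of the patch: for readers who want the counter scheme without the
diff noise, here is a minimal userspace sketch of the idea. It is a
simplification under invented names (demo_alloc, slow_alloc,
percpu_freelist; a plain static stands in for one CPU's kmem_cache_cpu
fields), and unlike the real patch, which uses preempt_disable() with
irqsafe_cpu_inc()/irqsafe_cpu_dec() and decrements inside __slab_alloc()
on the slow path, the sketch decrements in one place.

	/*
	 * Minimal, standalone illustration of the "active" counter scheme.
	 * NOT kernel code: plain statics model a single CPU, and all names
	 * are invented.  The counter starts at -1; the fastpath increments
	 * it on entry, so any value other than 0 after the increment means
	 * another context on this CPU is already inside the fastpath.
	 */
	#include <stdio.h>

	static int active = -1;		/* stands in for s->cpu_slab->active */
	static char demo_object[] = "object from per-cpu freelist";
	static void *percpu_freelist = demo_object;	/* s->cpu_slab->freelist */

	static void *slow_alloc(void)
	{
		/* Would take the slab lock and allocate without per-cpu caching. */
		return "object from slow path";
	}

	static void *demo_alloc(void)
	{
		void *object;

		active++;	/* preempt_disable() + irqsafe_cpu_inc() in the patch */

		object = percpu_freelist;
		if (!object || active != 0) {
			/* Freelist empty or fastpath busy: bypass per-cpu caching. */
			object = slow_alloc();
		} else {
			/* Fastpath: pop the object off the per-cpu freelist. */
			percpu_freelist = NULL;	/* really get_freepointer(s, object) */
		}

		active--;	/* irqsafe_cpu_dec() + preempt_enable() */
		return object;
	}

	int main(void)
	{
		printf("%s\n", (char *)demo_alloc());	/* per-cpu freelist hit */
		printf("%s\n", (char *)demo_alloc());	/* freelist empty -> slow path */
		return 0;
	}

Starting the counter at -1 means a single, uncontended entry brings it to
exactly 0; any nested entry on the same CPU (for example an allocation from
an interrupt handler) observes a non-zero value and is diverted to the slow
path, which never touches the per-cpu freelist.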