[Only for review; untested. Waiting for Mel to get around to using this
to improve memory compaction / anti-fragmentation.]

Add a special function, kmem_cache_vacate(), to push out the objects in
a specified slab. In order to make that work, slab page allocations must
be handled in such a way that we can determine whether a slab is valid
whenever we access it, regardless of where the page is in its lifetime.

A valid slab whose objects can be freed has PageSlab(page) set and
page->inuse > 0. So we must make sure in allocate_slab that page->inuse
is zero before PageSlab is set; otherwise kmem_cache_vacate may operate
on a slab that has not been properly set up yet.

Signed-off-by: Christoph Lameter

---
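Reviewer note (illustration only, not part of the patch): the requirement
above is a plain publish/observe ordering. The userspace C sketch below
models it with C11 atomics: the release store stands in for the
smp_wmb() + __SetPageSlab() pair in new_slab(), and the acquire load
stands in for the implicit barrier taken with slab_lock() on the vacate
side. The names (fake_page, slab_bit, allocate_side, vacate_side) are
made up for the sketch and do not exist in the kernel.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for struct page: only the two fields the ordering cares about. */
struct fake_page {
	int inuse;		/* plays the role of page->inuse */
	atomic_int slab_bit;	/* plays the role of PageSlab(page) */
};

/* Stale inuse from a previous life of the page; slab_bit starts at 0. */
static struct fake_page pg = { .inuse = 42 };

/* Mirrors new_slab(): clear inuse first, then publish the slab bit. */
static void *allocate_side(void *arg)
{
	pg.inuse = 0;
	/* release store ~ smp_wmb() + __SetPageSlab(page) */
	atomic_store_explicit(&pg.slab_bit, 1, memory_order_release);
	return NULL;
}

/* Mirrors the vacate side: only trust inuse after seeing the slab bit. */
static void *vacate_side(void *arg)
{
	/* acquire load ~ the implicit barrier taken with slab_lock(page) */
	if (atomic_load_explicit(&pg.slab_bit, memory_order_acquire))
		printf("slab published, inuse=%d (never the stale 42)\n",
			pg.inuse);
	else
		printf("slab not published yet, skip this page\n");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, allocate_side, NULL);
	pthread_create(&b, NULL, vacate_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

In the kernel the vacate side additionally holds the slab lock while it
rechecks PageSlab(), SlabFrozen() and page->inuse, so the state cannot
change underneath it. A future caller such as the compaction /
anti-fragmentation work mentioned above would presumably just hand a
candidate struct page to kmem_cache_vacate() and skip the page if the
call does not succeed.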
 include/linux/slab.h |    1 
 mm/slab.c            |    9 ++++
 mm/slob.c            |    9 ++++
 mm/slub.c            |  105 +++++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 117 insertions(+), 7 deletions(-)

Index: linux-2.6.22-rc6-mm1/include/linux/slab.h
===================================================================
--- linux-2.6.22-rc6-mm1.orig/include/linux/slab.h	2007-07-05 19:05:02.000000000 -0700
+++ linux-2.6.22-rc6-mm1/include/linux/slab.h	2007-07-05 19:05:08.000000000 -0700
@@ -97,6 +97,7 @@ unsigned int kmem_cache_size(struct kmem
 const char *kmem_cache_name(struct kmem_cache *);
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
 int kmem_cache_defrag(int node);
+int kmem_cache_vacate(struct page *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
Index: linux-2.6.22-rc6-mm1/mm/slab.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/mm/slab.c	2007-07-05 19:00:20.000000000 -0700
+++ linux-2.6.22-rc6-mm1/mm/slab.c	2007-07-05 19:05:08.000000000 -0700
@@ -2523,6 +2523,15 @@ int kmem_cache_defrag(int percent, int n
 	return 0;
 }
 
+/*
+ * SLAB does not support slab defragmentation
+ */
+int kmem_cache_vacate(struct page *page)
+{
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_vacate);
+
 /**
  * kmem_cache_destroy - delete a cache
  * @cachep: the cache to destroy
Index: linux-2.6.22-rc6-mm1/mm/slob.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/mm/slob.c	2007-07-05 19:00:20.000000000 -0700
+++ linux-2.6.22-rc6-mm1/mm/slob.c	2007-07-05 19:05:08.000000000 -0700
@@ -596,6 +596,15 @@ int kmem_cache_defrag(int percentage, in
 	return 0;
 }
 
+/*
+ * SLOB does not support slab defragmentation
+ */
+int kmem_cache_vacate(struct page *page)
+{
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_vacate);
+
 int kmem_ptr_validate(struct kmem_cache *a, const void *b)
 {
 	return 0;
Index: linux-2.6.22-rc6-mm1/mm/slub.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/mm/slub.c	2007-07-05 19:01:48.000000000 -0700
+++ linux-2.6.22-rc6-mm1/mm/slub.c	2007-07-05 19:05:08.000000000 -0700
@@ -1041,6 +1041,7 @@ static inline int slab_pad_check(struct
 static inline int check_object(struct kmem_cache *s, struct page *page,
 			void *object, int active) { return 1; }
 static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct page *page) {}
 static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
 #define slub_debug 0
 #endif
@@ -1106,12 +1107,11 @@ static struct page *new_slab(struct kmem
 	n = get_node(s, page_to_nid(page));
 	if (n)
 		atomic_long_inc(&n->nr_slabs);
+
+	page->inuse = 0;
+	page->lockless_freelist = NULL;
 	page->offset = s->offset / sizeof(void *);
 	page->slab = s;
-	page->flags |= 1 << PG_slab;
-	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
-			SLAB_STORE_USER | SLAB_TRACE))
-		SetSlabDebug(page);
 
 	start = page_address(page);
 	end = start + s->objects * s->size;
@@ -1129,9 +1129,18 @@ static struct page *new_slab(struct kmem
 	set_freepointer(s, last, NULL);
 
 	page->freelist = start;
-	page->lockless_freelist = NULL;
-	page->inuse = 0;
-out:
+
+	/*
+	 * page->inuse must be 0 when PageSlab(page) becomes
+	 * true so that defrag knows that this slab is not in use.
+	 */
+	smp_wmb();
+	__SetPageSlab(page);
+	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
+			SLAB_STORE_USER | SLAB_TRACE))
+		SetSlabDebug(page);
+
+ out:
 	if (flags & __GFP_WAIT)
 		local_irq_disable();
 	return page;
@@ -2660,6 +2669,88 @@ static unsigned long __kmem_cache_shrink
 }
 
 /*
+ * Get a page off a list and freeze it. Must be holding slab lock.
+ */
+static void freeze_from_list(struct kmem_cache *s, struct page *page)
+{
+	if (page->inuse < s->objects)
+		remove_partial(s, page);
+	else if (s->flags & SLAB_STORE_USER)
+		remove_full(s, page);
+	SetSlabFrozen(page);
+}
+
+/*
+ * Attempt to free objects in a page. Return 1 if successful.
+ */
+int kmem_cache_vacate(struct page *page)
+{
+	unsigned long flags;
+	struct kmem_cache *s;
+	int vacated = 0;
+	void **vector = NULL;
+
+	/*
+	 * Get a reference to the page. Return if it is freed or being freed.
+	 * This is necessary to make sure that the page does not vanish
+	 * from under us before we are able to check the result.
+	 */
+	if (!get_page_unless_zero(page))
+		return 0;
+
+	if (!PageSlab(page))
+		goto out;
+
+	s = page->slab;
+	if (!s)
+		goto out;
+
+	vector = kmalloc(s->objects * sizeof(void *), GFP_KERNEL);
+	if (!vector)
+		goto out2;
+
+	local_irq_save(flags);
+	/*
+	 * The implicit memory barrier in slab_lock guarantees that page->inuse
+	 * is loaded after PageSlab(page) has been established to be true.
+	 * Only relevant for a newly created slab.
+	 */
+	slab_lock(page);
+
+	/*
+	 * We may now have locked a page that may be in various stages of
+	 * being freed. If the PageSlab bit is off then we have already
+	 * reached the page allocator. If page->inuse is zero then we are
+	 * in SLUB but freeing or allocating the page.
+	 * page->inuse is never modified without the slab lock held.
+	 *
+	 * Also abort if the page happens to be already frozen. If it is
+	 * frozen then a concurrent vacate may be in progress.
+	 */
+	if (!PageSlab(page) || SlabFrozen(page) || !page->inuse)
+		goto out_locked;
+
+	/*
+	 * We are holding a lock on a slab page and all operations on the
+	 * slab are blocking.
+	 */
+	if (!s->ops->get || !s->ops->kick)
+		goto out_locked;
+	freeze_from_list(s, page);
+	vacated = __kmem_cache_vacate(s, page, flags, vector);
+out:
+	kfree(vector);
+out2:
+	put_page(page);
+	return vacated == 0;
+out_locked:
+	slab_unlock(page);
+	local_irq_restore(flags);
+	goto out;
+
+}
+
+/*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
  * the remaining slabs by the number of items in use. The slabs with the
  * most items in use come first. New allocations will then fill those up