At some point slab defragmentation needs to be triggered. The logical
point for this is after slab shrinking has been performed in vmscan.c:
at that point objects have just been freed, so the fragmentation of the
slabs has increased. We therefore call kmem_cache_defrag() from there.
kmem_cache_defrag() takes the defrag ratio as a parameter and uses it to
decide whether a slab should be defragmented. We define a new VM tunable,
slab_defrag_ratio, that contains the limit used to trigger slab
defragmentation.

shrink_slab() in vmscan.c is called in some contexts to do global
shrinking of slabs and in others to shrink slabs on behalf of a
particular zone. Pass the zone to shrink_slab() so that it can call
kmem_cache_defrag() with the corresponding node and restrict the
defragmentation to the node that is under memory pressure.

Signed-off-by: Christoph Lameter

---
 Documentation/sysctl/vm.txt |   25 +++++++++++++++++++++++++
 fs/drop_caches.c            |    2 +-
 include/linux/mm.h          |    2 +-
 include/linux/slab.h        |    1 +
 kernel/sysctl.c             |   10 ++++++++++
 mm/vmscan.c                 |   34 +++++++++++++++++++++++++++-------
 6 files changed, 65 insertions(+), 9 deletions(-)

Index: slub/Documentation/sysctl/vm.txt
===================================================================
--- slub.orig/Documentation/sysctl/vm.txt	2007-06-07 14:20:48.000000000 -0700
+++ slub/Documentation/sysctl/vm.txt	2007-06-07 14:22:35.000000000 -0700
@@ -35,6 +35,7 @@ Currently, these files are in /proc/sys/
 - swap_prefetch
 - swap_prefetch_delay
 - swap_prefetch_sleep
+- slab_defrag_ratio
 
 ==============================================================
 
@@ -300,3 +301,27 @@ sleep for when the ram is found to be fu
 further.
 
 The default value is 5.
+
+==============================================================
+
+slab_defrag_ratio
+
+After shrinking the slabs the system checks whether slabs have a lower
+usage ratio than the percentage given here. If so, slab defragmentation
+is activated to increase the usage ratio of those slabs and to free
+memory.
+
+This is the percentage of objects allocated out of the total possible
+number of objects in a slab. A lower percentage signifies more fragmentation.
+
+Note that slab defragmentation only works on slabs that have the proper
+methods defined (see /sys/slab/<cache>/ops). When this text was written,
+slab defragmentation was supported only by the dentry and inode caches.
+
+The main purpose of slab defragmentation is to address pathological
+situations in which large numbers of inodes or dentries have been
+removed from the system. That may leave lots of slabs around with just
+a few objects. Slab defragmentation removes these slabs.
+
+The default value is 30%, meaning that for every 3 objects in use there
+are 7 free and unused object slots.
Index: slub/include/linux/slab.h
===================================================================
--- slub.orig/include/linux/slab.h	2007-06-07 14:20:48.000000000 -0700
+++ slub/include/linux/slab.h	2007-06-07 14:22:35.000000000 -0700
@@ -85,6 +85,7 @@ void kmem_cache_free(struct kmem_cache *
 unsigned int kmem_cache_size(struct kmem_cache *);
 const char *kmem_cache_name(struct kmem_cache *);
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
+int kmem_cache_defrag(int percentage, int node);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
Index: slub/kernel/sysctl.c
===================================================================
--- slub.orig/kernel/sysctl.c	2007-06-07 14:20:48.000000000 -0700
+++ slub/kernel/sysctl.c	2007-06-07 14:22:35.000000000 -0700
@@ -81,6 +81,7 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
+extern int sysctl_slab_defrag_ratio;
 extern int audit_argv_kb;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -928,6 +929,15 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slab_defrag_ratio",
+		.data		= &sysctl_slab_defrag_ratio,
+		.maxlen		= sizeof(sysctl_slab_defrag_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
 		.ctl_name	= VM_LEGACY_VA_LAYOUT,
Index: slub/mm/vmscan.c
===================================================================
--- slub.orig/mm/vmscan.c	2007-06-07 14:20:48.000000000 -0700
+++ slub/mm/vmscan.c	2007-06-07 14:25:25.000000000 -0700
@@ -135,6 +135,12 @@ void unregister_shrinker(struct shrinker
 EXPORT_SYMBOL(unregister_shrinker);
 
 #define SHRINK_BATCH 128
+
+/*
+ * Slabs should be defragmented if less than 30% of objects are allocated.
+ */
+int sysctl_slab_defrag_ratio = 30;
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -152,10 +158,19 @@ EXPORT_SYMBOL(unregister_shrinker);
  * are eligible for the caller's allocation attempt. It is used for balancing
  * slab reclaim versus page reclaim.
  *
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone can be NULL. This is currently
+ * only used to limit slab defragmentation to a NUMA node. The performance
+ * of shrink_slab would be better (in particular under NUMA) if it could
+ * be targeted as a whole to a zone that is under memory pressure, but
+ * the VFS data structures do not allow that at the present time. As a
+ * result zone_reclaim must perform global slab reclaim in order
+ * to free up memory in a zone.
+ *
  * Returns the number of slab objects which we shrunk.
  */
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+			unsigned long lru_pages, struct zone *zone)
 {
 	struct shrinker *shrinker;
 	unsigned long ret = 0;
@@ -218,6 +233,8 @@ unsigned long shrink_slab(unsigned long
 		shrinker->nr += total_scan;
 	}
 	up_read(&shrinker_rwsem);
+	kmem_cache_defrag(sysctl_slab_defrag_ratio,
+			zone ? zone_to_nid(zone) : -1);
 	return ret;
 }
 
@@ -1163,7 +1180,8 @@ unsigned long try_to_free_pages(struct z
 		if (!priority)
 			disable_swap_token();
 		nr_reclaimed += shrink_zones(priority, zones, &sc);
-		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages,
+						NULL);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
@@ -1333,7 +1351,7 @@ loop_again:
 			nr_reclaimed += shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
-						lru_pages);
+						lru_pages, zone);
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
@@ -1601,7 +1619,7 @@ unsigned long shrink_all_memory(unsigned
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
 		reclaim_state.reclaimed_slab = 0;
-		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
 		if (!reclaim_state.reclaimed_slab)
 			break;
 
@@ -1639,7 +1657,7 @@ unsigned long shrink_all_memory(unsigned
 
 		reclaim_state.reclaimed_slab = 0;
 		shrink_slab(sc.nr_scanned, sc.gfp_mask,
-				count_lru_pages());
+				count_lru_pages(), NULL);
 		ret += reclaim_state.reclaimed_slab;
 		if (ret >= nr_pages)
 			goto out;
@@ -1656,7 +1674,8 @@ unsigned long shrink_all_memory(unsigned
 	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask,
+					count_lru_pages(), NULL);
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
 	}
@@ -1816,7 +1835,8 @@ static int __zone_reclaim(struct zone *z
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+							zone) &&
 			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
 				slab_reclaimable - nr_pages)
 			;
Index: slub/fs/drop_caches.c
===================================================================
--- slub.orig/fs/drop_caches.c	2007-06-07 14:20:48.000000000 -0700
+++ slub/fs/drop_caches.c	2007-06-07 14:22:35.000000000 -0700
@@ -52,7 +52,7 @@ void drop_slab(void)
 	int nr_objects;
 
 	do {
-		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL);
 	} while (nr_objects > 10);
 }
 
Index: slub/include/linux/mm.h
===================================================================
--- slub.orig/include/linux/mm.h	2007-06-07 14:20:48.000000000 -0700
+++ slub/include/linux/mm.h	2007-06-07 14:22:35.000000000 -0700
@@ -1240,7 +1240,7 @@ int in_gate_area_no_task(unsigned long a
 int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages);
+			unsigned long lru_pages, struct zone *zone);
 extern void drop_pagecache_sb(struct super_block *);
 void drop_pagecache(void);
 void drop_slab(void);
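
For illustration only, not part of the patch: the tail of shrink_slab() now
triggers slab defragmentation, restricted to the node of the zone under
memory pressure when a zone was passed in. A minimal sketch of that call
pattern in C, assuming the kmem_cache_defrag() interface added earlier in
this series (the helper name below is made up for the example):

	/*
	 * Sketch of the call pattern introduced by this patch: map the new
	 * zone argument of shrink_slab() to a node for kmem_cache_defrag().
	 * A NULL zone (global shrink) is passed as node -1, i.e.
	 * defragmentation is not restricted to a particular node.
	 */
	static void defrag_shrunken_slabs(struct zone *zone)
	{
		int node = zone ? zone_to_nid(zone) : -1;

		kmem_cache_defrag(sysctl_slab_defrag_ratio, node);
	}

In the patch itself these statements simply sit at the end of shrink_slab(),
right after up_read(&shrinker_rwsem).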