SLUB: slab defragmentation trigger At some point slab defragmentation needs to be triggered. The logical point for this is after slab shrinking was performed in vmscan.c. At that point the fragmentation ratio of a slab was increased by objects being freed. So we call kmem_cache_defrag from there. kmem_cache_defrag takes the defrag ratio to make the decision to defrag a slab or not. We define a new VM tunable slab_defrag_ratio that contains the limit to trigger slab defragmentation. Signed-off-by: Christoph Lameter <clameter@sgi.com> --- Documentation/sysctl/vm.txt | 25 +++++++++++++++++++++++++ Documentation/vm/slabinfo.c | 39 ++++++++++++++++++++++++++++++++++----- include/linux/slab.h | 1 + kernel/sysctl.c | 10 ++++++++++ mm/vmscan.c | 7 +++++++ 5 files changed, 77 insertions(+), 5 deletions(-) Index: slub/Documentation/sysctl/vm.txt =================================================================== --- slub.orig/Documentation/sysctl/vm.txt 2007-06-05 17:51:10.000000000 -0700 +++ slub/Documentation/sysctl/vm.txt 2007-06-05 17:51:20.000000000 -0700 @@ -35,6 +35,7 @@ Currently, these files are in /proc/sys/ - swap_prefetch - swap_prefetch_delay - swap_prefetch_sleep +- slab_defrag_ratio ============================================================== @@ -300,3 +301,27 @@ sleep for when the ram is found to be fu further. The default value is 5. + +============================================================== + +slab_defrag_ratio + +After shrinking the slabs the system checks if slabs have a lower usage +ratio than the percentage given here. If so then slab defragmentation is +activated to increase the usage ratio of the slab and in order to free +memory. + +This is the percentage of objects allocated of the total possible number +of objects in a slab. A lower percentage signifies more fragmentation. + +Note slab defragmentation only works on slabs that have the proper methods +defined (see /sys/slab/<slabname>/ops). 
When this text was written slab +defragmentation was only supported by the dentry cache and the inode cache. + +The main purpose of the slab defragmentation is to address pathological +situations in which large amounts of inodes or dentries have been +removed from the system. That may leave lots of slabs around with just +a few objects. Slab defragmentation removes these slabs. + +The default value is 30% meaning for 3 items in use we have 7 free +and unused items. Index: slub/include/linux/slab.h =================================================================== --- slub.orig/include/linux/slab.h 2007-06-05 17:51:16.000000000 -0700 +++ slub/include/linux/slab.h 2007-06-05 17:51:20.000000000 -0700 @@ -85,6 +85,7 @@ void kmem_cache_free(struct kmem_cache * unsigned int kmem_cache_size(struct kmem_cache *); const char *kmem_cache_name(struct kmem_cache *); int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); +int kmem_cache_defrag(int percentage); /* * Please use this macro to create slab caches. 
Simply specify the Index: slub/kernel/sysctl.c =================================================================== --- slub.orig/kernel/sysctl.c 2007-06-05 17:51:10.000000000 -0700 +++ slub/kernel/sysctl.c 2007-06-05 17:51:20.000000000 -0700 @@ -81,6 +81,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int sysctl_slab_defrag_ratio; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -917,6 +918,15 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slab_defrag_ratio", + .data = &sysctl_slab_defrag_ratio, + .maxlen = sizeof(sysctl_slab_defrag_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { .ctl_name = VM_LEGACY_VA_LAYOUT, Index: slub/mm/vmscan.c =================================================================== --- slub.orig/mm/vmscan.c 2007-06-05 17:51:10.000000000 -0700 +++ slub/mm/vmscan.c 2007-06-05 17:51:20.000000000 -0700 @@ -135,6 +135,12 @@ void unregister_shrinker(struct shrinker EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 + +/* + * Slabs should be defragmented if less than 30% of objects are allocated. 
+ */ +int sysctl_slab_defrag_ratio = 30; + /* * Call the shrink functions to age shrinkable caches * @@ -218,6 +224,7 @@ unsigned long shrink_slab(unsigned long shrinker->nr += total_scan; } up_read(&shrinker_rwsem); + kmem_cache_defrag(sysctl_slab_defrag_ratio); return ret; } Index: slub/Documentation/vm/slabinfo.c =================================================================== --- slub.orig/Documentation/vm/slabinfo.c 2007-06-05 17:51:25.000000000 -0700 +++ slub/Documentation/vm/slabinfo.c 2007-06-05 18:07:08.000000000 -0700 @@ -30,6 +30,7 @@ struct slabinfo { int hwcache_align, object_size, objs_per_slab; int sanity_checks, slab_size, store_user, trace; int order, poison, reclaim_account, red_zone; + int defrag, ctor; unsigned long partial, objects, slabs; int numa[MAX_NODES]; int numa_partial[MAX_NODES]; @@ -56,6 +57,8 @@ int show_slab = 0; int skip_zero = 1; int show_numa = 0; int show_track = 0; +int show_defrag = 0; +int show_ctor = 0; int show_first_alias = 0; int validate = 0; int shrink = 0; @@ -90,18 +93,20 @@ void fatal(const char *x, ...) void usage(void) { printf("slabinfo 5/7/2007. (c) 2007 sgi. 
clameter@sgi.com\n\n" - "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" + "slabinfo [-aCDefhilnosSrtTvz1] [-d debugopts] [slab-regexp]\n" "-a|--aliases Show aliases\n" + "-C|--ctor Show slabs with ctors\n" "-d|--debug= Set/Clear Debug options\n" - "-e|--empty Show empty slabs\n" + "-D|--defrag Show defragmentable caches\n" + "-e|--empty Show empty slabs\n" "-f|--first-alias Show first alias\n" "-h|--help Show usage information\n" "-i|--inverted Inverted list\n" "-l|--slabs Show slabs\n" "-n|--numa Show NUMA information\n" - "-o|--ops Show kmem_cache_ops\n" + "-o|--ops Show kmem_cache_ops\n" "-s|--shrink Shrink slabs\n" - "-r|--report Detailed report on single slabs\n" + "-r|--report Detailed report on single slabs\n" "-S|--Size Sort by size\n" "-t|--tracking Show alloc/free information\n" "-T|--Totals Show summary information\n" @@ -452,6 +457,12 @@ void slabcache(struct slabinfo *s) if (show_empty && s->slabs) return; + if (show_defrag && !s->defrag) + return; + + if (show_ctor && !s->ctor) + return; + store_size(size_str, slab_size(s)); sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs); @@ -462,6 +473,10 @@ void slabcache(struct slabinfo *s) *p++ = '*'; if (s->cache_dma) *p++ = 'd'; + if (s->defrag) + *p++ = 'D'; + if (s->ctor) + *p++ = 'C'; if (s->hwcache_align) *p++ = 'A'; if (s->poison) @@ -1072,6 +1087,12 @@ void read_slab_dir(void) slab->store_user = get_obj("store_user"); slab->trace = get_obj("trace"); chdir(".."); + if (read_slab_obj(slab, "ops")) { + if (strstr(buffer, "ctor :")) + slab->ctor = 1; + if (strstr(buffer, "kick :")) + slab->defrag = 1; + } if (slab->name[0] == ':') alias_targets++; slab++; @@ -1121,7 +1142,9 @@ void output_slabs(void) struct option opts[] = { { "aliases", 0, NULL, 'a' }, + { "ctor", 0, NULL, 'C' }, { "debug", 2, NULL, 'd' }, + { "defrag", 0, NULL, 'D' }, { "empty", 0, NULL, 'e' }, { "first-alias", 0, NULL, 'f' }, { "help", 0, NULL, 'h' }, @@ -1146,7 +1169,7 @@ int main(int argc, char *argv[]) page_size 
= getpagesize(); - while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS", + while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzCDTS", opts, NULL)) != -1) switch(c) { case '1': @@ -1196,6 +1219,12 @@ int main(int argc, char *argv[]) case 'z': skip_zero = 0; break; + case 'C': + show_ctor = 1; + break; + case 'D': + show_defrag = 1; + break; case 'T': show_totals = 1; break;