To: akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: dgc@sgi.com Subject: Slab defragmentation V4 V3->V4: - Optimize scan for slabs that need defragmentation - Add /sys/slab/*/defrag_ratio to allow setting defrag limits per slab. - Add support for buffer heads. - Describe how the cleanup after the daily updatedb can be improved by slab defragmentation. V2->V3 - Support directory reclaim - Add infrastructure to trigger defragmentation after slab shrinking if we have slabs with a high degree of fragmentation. V1->V2 - Clean up control flow using a state variable. Simplify API. Back to 2 functions that now take arrays of objects. - Inode defrag support for a set of filesystems - Fix up dentry defrag support to work on negative dentries by adding a new dentry flag that indicates that a dentry is not in the process of being freed or allocated. Slab defragmentation is useful to increase the object density in slab caches. On reclaim the fragmentation ratios will be checked and if the object density (ratio between maximum objects that could be stored in the allocated slabs and the actual objects in use) in a defragmentable slab is less than a certain percentage (defaults to 30%) then the slabs with the lowest number of objects in them will be freed which increases the object density. Currently supported are 1. dentry defrag 2. inode defrag (with a generic interface to allow easy setup of more filesystems than the currently supported ext2/3/4, reiserfs, XFS and proc) 3. buffer_heads One typical mechanism that triggers slab defragmentation on my systems is the daily run of updatedb. Updatedb scans all files on the system which causes a high inode and dentry use. After updatedb is complete we need to go back to the regular use patterns (typical on my machine: kernel compiles). Those need the memory now for different purposes. 
The inodes and dentries used for updatedb will gradually be aged by the dentry/inode reclaim algorithm which will free up the dentries and inode entries randomly through the slabs that were allocated. As a result the slabs will become sparsely populated. If they become empty then they can be freed but a lot of them will remain sparsely populated. That is where slab defrag comes in: It removes the slabs with just a few entries reclaiming more memory for other uses. Currently slab reclaim writes messages like this to the syslog if slab defrag is occurring: Test results (see appended scripts / user space code for more data) (3 level tree with 10 entries at the first level, 20 at the second and 30 files at the third level. Files at the lowest level were removed to create inode fragmentation) %Ra is the allocation ratio (need to apply the slabinfo patch to get those numbers) inode reclaim in reiserfs Name Objects Objsize Space Slabs/Part/Cpu O/S O %Ra %Ef Flg dentry 14660 200 3.0M 733/0/1 20 0 100 97 Da reiser_inode_cache 1596 640 4.1M 256/201/1 25 2 24 24 DCa Status after defrag Name Objects Objsize Space Slabs/Part/Cpu O/S O %Ra %Ef Flg dentry 8849 200 1.8M 454/17/1 20 0 97 95 Da reiser_inode_cache 1381 640 1.0M 65/11/0 25 2 84 82 DCa Slab defragmentation can be triggered in two ways: 1. Manually by running slabinfo -s or manually by the kernel calling kmem_cache_shrink(slab) (Currently only ACPI is doing such a call to a slab that has no defragmentation support. In that case we simply do what SLAB does: drop per cpu caches and sift through the partial list for free slabs). 2. Automatically if defragmentable slabs reach a certain degree of fragmentation. The point where slab defragmentation occurs can be set at /proc/sys/vm/slab_defrag_ratio Slab fragmentation is measured by how much of the possible objects in a slab are in use. The default setting for slab_defrag_ratio is 30%. 
This means that slab defragmentation is going to be triggered if there are more than 3 free object slots for each allocated object. Setting the slab_defrag_ratio higher will cause more defragmentation runs. If slab_defrag_ratio is set to 0 then no slab defragmentation occurs. Slabs are checked for their fragmentation levels after the slabs have been shrunk by running shrinkers in mm/vmscan.c during memory reclaim. This means that slab defragmentation is only triggered if we are under memory pressure and if there is significant slab fragmentation. Test script: #!/bin/sh echo 30 >/proc/sys/vm/slab_defrag_ratio ./gazfiles c 3 10 20 30 echo "Status before" slabinfo -D ./gazfiles d 2 echo "Status after removing files" slabinfo -D slabinfo -s echo "Status after defrag" slabinfo -D ./gazfiles d 0 gazfiles.c : /* * Create a gazillion of files to be able to create slab fragmentation * * (C) 2007 sgi, Christoph Lameter * * Create an n-layered hierarchy of empty files * * gazfiles ... * * gazfiles c[reate] 3 50 50 50 * * gazfiles s[hrink] * * gazfiles r[andomkill] */ #include #include #include #include #include #include #include #include #include #include #include #define MAXIMUM_LEVELS 10 int level; int sizes[MAXIMUM_LEVELS]; void fatal(const char *x, ...) 
{ va_list ap; va_start(ap, x); vfprintf(stderr, x, ap); va_end(ap); exit(1); } int read_gaz(void) { FILE *f = fopen(".gazinfo", "r"); int rc = 0; int i; if (!f) return 0; if (!fscanf(f, "%d", &level)) goto out; if (level >= MAXIMUM_LEVELS) goto out; for (i = 0; i < level; i++) if (!fscanf(f, " %d", &sizes[i])) goto out; rc = 1; out: fclose(f); return rc; } void write_gaz(void) { FILE *f = fopen(".gazinfo","w"); int i; fprintf(f, "%d",level); for (i = 0; i < level; i++) fprintf(f," %d", sizes[i]); fprintf(f, "\n"); fclose(f); } void cre(int l) { int i; for (i = 0; i < sizes[l - 1]; i++) { char name[20]; sprintf(name, "%03d", i); if (l < level) { mkdir(name, 0775); chdir(name); cre(l + 1); chdir(".."); } else { FILE *f; f = fopen(name,"w"); fprintf(f, "Test"); fclose(f); } } } void create(int l, char **sz) { int i; level = l; for (i = 0; i < level; i++) sizes[i] = atoi(sz[i]); if (mkdir("gazf", 0775)) fatal("Cannot create gazf here\n"); chdir("gazf"); write_gaz(); cre(1); chdir(".."); } void shrink(int level) { if (chdir("gazf")) fatal("No gazfiles in this directory"); read_gaz(); chdir(".."); } void scand(int l, void (*func)(int, int, char *, unsigned long), unsigned long level) { DIR *dir; struct dirent *de; dir = opendir("."); if (!dir) fatal("Cannot open directory"); while ((de = readdir(dir))) { struct stat s; if (de->d_name[0] == '.') continue; /* * Some idiot broke the glibc library or made it impossible * to figure out how to make readdir work right */ stat(de->d_name, &s); if (S_ISDIR(s.st_mode)) de->d_type = DT_DIR; if (de->d_type == DT_DIR) { if (chdir(de->d_name)) fatal("Cannot enter %s", de->d_name); scand(l + 1, func, level); chdir(".."); func(l, 1, de->d_name, level); } else { func(l, 0, de->d_name, level); } } closedir(dir); } void traverse(void (*func)(int, int, char *, unsigned long), unsigned long level) { if (chdir("gazf")) fatal("No gazfiles in this directory"); scand(1, func, level); chdir(".."); } void randomkill(int nr) { if (chdir("gazf")) 
fatal("No gazfiles in this directory"); read_gaz(); chdir(".."); } void del_func(int l, int dir, char *name, unsigned long level) { if (l <= level) return; if (dir) { if (rmdir(name)) fatal("Cannot remove directory %s"); } else { if (unlink(name)) fatal("Cannot unlink file %s"); } } void delete(int l) { if (l == 0) { system("rm -rf gazf"); return; } traverse(del_func, l); } void usage(void) { printf("gazfiles: Tool to manage gazillions of files\n\n"); printf("gazfiles create <#l1> <#l2> ...\n"); printf("gazfiles delete \n"); printf("gazfiles shrink \n"); printf("gazfiles randomkill \n\n"); printf("(C) 2007 sgi, Christoph Lameter \n"); exit(0); } int main(int argc, char *argv[]) { if (argc < 2) usage(); switch (argv[1][0]) { case 'c' : create(atoi(argv[2]), argv + 3); break; case 's' : if (argc != 3) usage(); shrink(atoi(argv[2])); break; case 'r' : if (argc != 3) usage(); randomkill(atoi(argv[2])); break; case 'd': if (argc != 3) usage(); delete(atoi(argv[2])); break; default: usage(); } return 0; }