Optimize RCU handling for struct file by relying on slab RCU handling Dave Miller Jens Axboe Ingo Molnar Hugh Dickins Currently we schedule RCU frees for each file we free separately. That has several drawbacks against the earlier file handling (in 2.6.5 f.e.) that did not require RCU callbacks. 1. Excessive number of RCU callbacks can be generated causing long RCU queues that in turn cause long latencies. 2. The cache hot object is not preserved between free and realloc. A close followed by another open is very fast with the RCU less approach because the last freed object is returned by the slab allocator that is still cache hot. RCU free means that the object is not immediately available again. The new object is cache cold and therefore open/close performance tests show a significant degradation with the RCU implementation. 3. Higher rates of RCU processing mean higher scheduling latencies. One solution to this problem is to move the RCU freeing into the Slab allocator by specifying SLAB_DESTROY_BY_RCU as an option at slab creation time. The slab allocator will do RCU frees only when it is necessary to dispose of slabs of objects (rare). So with that approach we can cut out the RCU overhead significantly. However, the slab allocator may return the object for another use even before the RCU period has expired under SLAB_DESTROY_BY_RCU. This means there is the (unlikely) possibility that the object is going to be switched under us in sections protected by rcu_read_lock() and rcu_read_unlock(). So we need to verify that we have acquired the correct object after establishing a stable object reference (incrementing the refcounter does that). I have tested this on IA64 NUMA with aim7. Patch was first discussed at: http://marc.theaimsgroup.com/?t=115048747200002&r=1&w=2 V1->V2: - Consolidate common code after finding fget in fget_light. - Rename fu_list to its prior name f_list. - Do not clear variables in get_empty_filp() that are cleared later by other subsystems Signed-off-by: Christoph Lameter Index: linux-2.6.17/include/linux/fs.h =================================================================== --- linux-2.6.17.orig/include/linux/fs.h 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/include/linux/fs.h 2006-06-22 12:24:42.017399331 -0700 @@ -252,6 +252,7 @@ extern void __init inode_init(unsigned l extern void __init inode_init_early(void); extern void __init mnt_init(unsigned long); extern void __init files_init(unsigned long); +extern void __init files_init_early(void); struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, @@ -630,14 +631,7 @@ struct file_ra_state { #define RA_FLAG_INCACHE 0x02 /* file is already in cache */ struct file { - /* - * fu_list becomes invalid after file_free is called and queued via - * fu_rcuhead for RCU freeing - */ - union { - struct list_head fu_list; - struct rcu_head fu_rcuhead; - } f_u; + struct list_head f_list; struct dentry *f_dentry; struct vfsmount *f_vfsmnt; const struct file_operations *f_op; Index: linux-2.6.17/fs/file_table.c =================================================================== --- linux-2.6.17.orig/fs/file_table.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/fs/file_table.c 2006-06-22 13:01:32.552223231 -0700 @@ -35,16 +35,33 @@ __cacheline_aligned_in_smp DEFINE_SPINLO static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static inline void file_free_rcu(struct rcu_head *head) +static void filp_constructor(void *data, struct kmem_cache *cachep, + unsigned long flags) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + struct file *f = data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) != + SLAB_CTOR_CONSTRUCTOR) + return; + + memset(f, 0, sizeof(*f)); + INIT_LIST_HEAD(&f->f_list); + atomic_set(&f->f_count, 0); + rwlock_init(&f->f_owner.lock); + eventpoll_init_file(f); +} + +void __init files_init_early(void) +{ + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + filp_constructor, NULL); } static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + kmem_cache_free(filp_cachep, f); } /* @@ -109,18 +126,15 @@ struct file *get_empty_filp(void) goto fail; percpu_counter_inc(&nr_files); - memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; - tsk = current; - INIT_LIST_HEAD(&f->f_u.fu_list); - atomic_set(&f->f_count, 1); - rwlock_init(&f->f_owner.lock); f->f_uid = tsk->fsuid; f->f_gid = tsk->fsgid; - eventpoll_init_file(f); - /* f->f_version: 0 */ + f->f_owner.signum = 0; + f->f_version = 0; + f->private_data = NULL; + atomic_inc(&f->f_count); /* We reached a definite state */ return f; over: @@ -196,6 +210,14 @@ struct file fastcall *fget(unsigned int rcu_read_unlock(); return NULL; } + /* + * Now we have a stable reference to an object. + * Check if RCU switched it from under us. + */ + if (unlikely(file != fcheck_files(files, fd))) { + put_filp(file); + file = NULL; + } } rcu_read_unlock(); @@ -220,18 +242,10 @@ struct file fastcall *fget_light(unsigne if (likely((atomic_read(&files->count) == 1))) { file = fcheck_files(files, fd); } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + file = fget(fd); + if (file) + *fput_needed = 1; } - return file; } @@ -250,15 +264,15 @@ void file_move(struct file *file, struct if (!list) return; file_list_lock(); - list_move(&file->f_u.fu_list, list); + list_move(&file->f_list, list); file_list_unlock(); } void file_kill(struct file *file) { - if (!list_empty(&file->f_u.fu_list)) { + if (!list_empty(&file->f_list)) { file_list_lock(); - list_del_init(&file->f_u.fu_list); + list_del_init(&file->f_list); file_list_unlock(); } } @@ -270,7 +284,7 @@ int fs_may_remount_ro(struct super_block /* Check that no files are currently opened for writing. */ file_list_lock(); list_for_each(p, &sb->s_files) { - struct file *file = list_entry(p, struct file, f_u.fu_list); + struct file *file = list_entry(p, struct file, f_list); struct inode *inode = file->f_dentry->d_inode; /* File with pending delete? */ Index: linux-2.6.17/fs/dcache.c =================================================================== --- linux-2.6.17.orig/fs/dcache.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/fs/dcache.c 2006-06-22 12:25:07.133029860 -0700 @@ -1751,9 +1751,7 @@ void __init vfs_caches_init(unsigned lon names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - + files_init_early(); dcache_init(mempages); inode_init(mempages); files_init(mempages); Index: linux-2.6.17/drivers/char/tty_io.c =================================================================== --- linux-2.6.17.orig/drivers/char/tty_io.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/drivers/char/tty_io.c 2006-06-21 14:49:52.941370261 -0700 @@ -1046,7 +1046,7 @@ static void do_tty_hangup(void *data) check_tty_count(tty, "do_tty_hangup"); file_list_lock(); /* This breaks for file handles being sent over AF_UNIX sockets ? */ - list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) { + list_for_each_entry(filp, &tty->tty_files, f_list) { if (filp->f_op->write == redirected_tty_write) cons_filp = filp; if (filp->f_op->write != tty_write) Index: linux-2.6.17/fs/dquot.c =================================================================== --- linux-2.6.17.orig/fs/dquot.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/fs/dquot.c 2006-06-21 14:49:52.942346763 -0700 @@ -693,7 +693,7 @@ static void add_dquot_ref(struct super_b restart: file_list_lock(); list_for_each(p, &sb->s_files) { - struct file *filp = list_entry(p, struct file, f_u.fu_list); + struct file *filp = list_entry(p, struct file, f_list); struct inode *inode = filp->f_dentry->d_inode; if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) { struct dentry *dentry = dget(filp->f_dentry); Index: linux-2.6.17/security/selinux/selinuxfs.c =================================================================== --- linux-2.6.17.orig/security/selinux/selinuxfs.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/security/selinux/selinuxfs.c 2006-06-21 14:49:52.943323265 -0700 @@ -901,7 +901,7 @@ static void sel_remove_bools(struct dent file_list_lock(); list_for_each(p, &sb->s_files) { - struct file * filp = list_entry(p, struct file, f_u.fu_list); + struct file * filp = list_entry(p, struct file, f_list); struct dentry * dentry = filp->f_dentry; if (dentry->d_parent != de) { Index: linux-2.6.17/fs/super.c =================================================================== --- linux-2.6.17.orig/fs/super.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/fs/super.c 2006-06-21 14:49:52.944299767 -0700 @@ -513,7 +513,7 @@ static void mark_files_ro(struct super_b struct file *f; file_list_lock(); - list_for_each_entry(f, &sb->s_files, f_u.fu_list) { + list_for_each_entry(f, &sb->s_files, f_list) { if (S_ISREG(f->f_dentry->d_inode->i_mode) && file_count(f)) f->f_mode &= ~FMODE_WRITE; } Index: linux-2.6.17/security/selinux/hooks.c =================================================================== --- linux-2.6.17.orig/security/selinux/hooks.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/security/selinux/hooks.c 2006-06-21 14:49:52.947229273 -0700 @@ -1611,7 +1611,7 @@ static inline void flush_unauthorized_fi if (tty) { file_list_lock(); - file = list_entry(tty->tty_files.next, typeof(*file), f_u.fu_list); + file = list_entry(tty->tty_files.next, typeof(*file), f_list); if (file) { /* Revalidate access to controlling tty. Use inode_has_perm on the tty inode directly rather Index: linux-2.6.17/fs/proc/generic.c =================================================================== --- linux-2.6.17.orig/fs/proc/generic.c 2006-06-17 18:49:35.000000000 -0700 +++ linux-2.6.17/fs/proc/generic.c 2006-06-21 14:49:52.948205776 -0700 @@ -557,7 +557,7 @@ static void proc_kill_inodes(struct proc */ file_list_lock(); list_for_each(p, &sb->s_files) { - struct file * filp = list_entry(p, struct file, f_u.fu_list); + struct file * filp = list_entry(p, struct file, f_list); struct dentry * dentry = filp->f_dentry; struct inode * inode; const struct file_operations *fops;