Index: linux-2.6/fs/super.c =================================================================== --- linux-2.6.orig/fs/super.c +++ linux-2.6/fs/super.c @@ -62,10 +62,41 @@ static struct super_block *alloc_super(s s = NULL; goto out; } +#ifdef CONFIG_SMP + s->s_files = alloc_percpu(struct list_head); + if (!s->s_files) { + security_sb_free(s); + kfree(s); + s = NULL; + goto out; + } else { + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i)); + } +#else INIT_LIST_HEAD(&s->s_files); +#endif +#ifdef CONFIG_SMP + s->s_inodes = alloc_percpu(struct list_head); + if (!s->s_inodes) { + free_percpu(s->s_files); + security_sb_free(s); + kfree(s); + s = NULL; + goto out; + } else { + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(per_cpu_ptr(s->s_inodes, i)); + } +#else + INIT_LIST_HEAD(&s->s_inodes); +#endif INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); - INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); @@ -117,6 +148,10 @@ out: */ static inline void destroy_super(struct super_block *s) { +#ifdef CONFIG_SMP + free_percpu(s->s_inodes); + free_percpu(s->s_files); +#endif security_sb_free(s); kfree(s->s_subtype); kfree(s->s_options); @@ -568,7 +603,7 @@ out: int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; - int remount_rw; + int remount_rw, remount_ro; if (sb->s_frozen != SB_UNFROZEN) return -EBUSY; @@ -583,9 +618,12 @@ int do_remount_sb(struct super_block *sb shrink_dcache_sb(sb); sync_filesystem(sb); + remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); + remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); + /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ - if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { + if (remount_ro) { if (force) mark_files_ro(sb); else if (!fs_may_remount_ro(sb)) @@ -594,7 +632,6 @@ int do_remount_sb(struct 
super_block *sb if (retval < 0 && retval != -ENOSYS) return -EBUSY; } - remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); @@ -604,6 +641,14 @@ int do_remount_sb(struct super_block *sb sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); if (remount_rw) vfs_dq_quota_on_remount(sb); + /* Some filesystems modify their metadata via some other path + than the bdev buffer cache (eg. use a private mapping, or + directories in pagecache, etc). Also file data modifications + go via their own mappings. So if we try to remount read-only + then copy the filesystem from bdev, we could get stale data, + so invalidate it to give a best effort at coherency. */ + if (remount_ro && sb->s_bdev) + invalidate_bdev(sb->s_bdev); return 0; } Index: linux-2.6/fs/pipe.c =================================================================== --- linux-2.6.orig/fs/pipe.c +++ linux-2.6/fs/pipe.c @@ -887,17 +887,6 @@ void free_pipe_info(struct inode *inode) } static struct vfsmount *pipe_mnt __read_mostly; -static int pipefs_delete_dentry(struct dentry *dentry) -{ - /* - * At creation time, we pretended this dentry was hashed - * (by clearing DCACHE_UNHASHED bit in d_flags) - * At delete time, we restore the truth : not hashed. - * (so that dput() can proceed correctly) - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 0; -} /* * pipefs_dname() is called from d_path(). 
@@ -909,7 +898,6 @@ static char *pipefs_dname(struct dentry } static const struct dentry_operations pipefs_dentry_operations = { - .d_delete = pipefs_delete_dentry, .d_dname = pipefs_dname, }; @@ -964,17 +952,12 @@ struct file *create_write_pipe(int flags goto err; err = -ENOMEM; - dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); + dentry = d_alloc(NULL, &name); if (!dentry) goto err_inode; - + dentry->d_parent = dentry; + dentry->d_flags |= DCACHE_DISCONNECTED; dentry->d_op = &pipefs_dentry_operations; - /* - * We dont want to publish this dentry into global dentry hash table. - * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED - * This permits a working /proc/$pid/fd/XXX on pipes - */ - dentry->d_flags &= ~DCACHE_UNHASHED; d_instantiate(dentry, inode); err = -ENFILE; Index: linux-2.6/net/socket.c =================================================================== --- linux-2.6.orig/net/socket.c +++ linux-2.6/net/socket.c @@ -257,12 +257,19 @@ static struct inode *sock_alloc_inode(st return &ei->vfs_inode; } -static void sock_destroy_inode(struct inode *inode) +static void sock_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(sock_inode_cachep, container_of(inode, struct socket_alloc, vfs_inode)); } +static void sock_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, sock_i_callback); +} + static void init_once(void *foo) { struct socket_alloc *ei = (struct socket_alloc *)foo; @@ -306,18 +313,6 @@ static struct file_system_type sock_fs_t .kill_sb = kill_anon_super, }; -static int sockfs_delete_dentry(struct dentry *dentry) -{ - /* - * At creation time, we pretended this dentry was hashed - * (by clearing DCACHE_UNHASHED bit in d_flags) - * At delete time, we restore the truth : not hashed. 
- * (so that dput() can proceed correctly) - */ - dentry->d_flags |= DCACHE_UNHASHED; - return 0; -} - /* * sockfs_dname() is called from d_path(). */ @@ -328,7 +323,6 @@ static char *sockfs_dname(struct dentry } static const struct dentry_operations sockfs_dentry_operations = { - .d_delete = sockfs_delete_dentry, .d_dname = sockfs_dname, }; @@ -372,17 +366,12 @@ static int sock_attach_fd(struct socket struct dentry *dentry; struct qstr name = { .name = "" }; - dentry = d_alloc(sock_mnt->mnt_sb->s_root, &name); + dentry = d_alloc(NULL, &name); if (unlikely(!dentry)) return -ENOMEM; - + dentry->d_parent = dentry; + dentry->d_flags |= DCACHE_DISCONNECTED; dentry->d_op = &sockfs_dentry_operations; - /* - * We dont want to push this dentry into global dentry hash table. - * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED - * This permits a working /proc/$pid/fd/XXX on sockets - */ - dentry->d_flags &= ~DCACHE_UNHASHED; d_instantiate(dentry, SOCK_INODE(sock)); sock->file = file; Index: linux-2.6/fs/anon_inodes.c =================================================================== --- linux-2.6.orig/fs/anon_inodes.c +++ linux-2.6/fs/anon_inodes.c @@ -35,24 +35,11 @@ static int anon_inodefs_get_sb(struct fi mnt); } -static int anon_inodefs_delete_dentry(struct dentry *dentry) -{ - /* - * We faked vfs to believe the dentry was hashed when we created it. - * Now we restore the flag so that dput() will work correctly. 
- */ - dentry->d_flags |= DCACHE_UNHASHED; - return 1; -} - static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", .get_sb = anon_inodefs_get_sb, .kill_sb = kill_anon_super, }; -static const struct dentry_operations anon_inodefs_dentry_operations = { - .d_delete = anon_inodefs_delete_dentry, -}; /* * nop .set_page_dirty method so that people can use .page_mkwrite on @@ -106,20 +93,21 @@ struct file *anon_inode_getfile(const ch this.name = name; this.len = strlen(name); this.hash = 0; - dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); + dentry = d_alloc(NULL, &this); if (!dentry) goto err_module; + dentry->d_parent = dentry; + dentry->d_flags |= DCACHE_DISCONNECTED; /* * We know the anon_inode inode count is always greater than zero, * so we can avoid doing an igrab() and we can use an open-coded * atomic_inc(). */ - atomic_inc(&anon_inode_inode->i_count); + spin_lock(&anon_inode_inode->i_lock); + anon_inode_inode->i_count++; + spin_unlock(&anon_inode_inode->i_lock); - dentry->d_op = &anon_inodefs_dentry_operations; - /* Do not publish this dentry inside the global dentry hash table */ - dentry->d_flags &= ~DCACHE_UNHASHED; d_instantiate(dentry, anon_inode_inode); error = -ENFILE; Index: linux-2.6/fs/nfs/write.c =================================================================== --- linux-2.6.orig/fs/nfs/write.c +++ linux-2.6/fs/nfs/write.c @@ -377,7 +377,7 @@ static int nfs_inode_add_request(struct error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); BUG_ON(error); if (!nfsi->npages) { - igrab(inode); + __iget(inode); if (nfs_have_delegation(inode, FMODE_WRITE)) nfsi->change_attr++; } Index: linux-2.6/drivers/char/pty.c =================================================================== --- linux-2.6.orig/drivers/char/pty.c +++ linux-2.6/drivers/char/pty.c @@ -654,7 +654,11 @@ static int __ptmx_open(struct inode *ino set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ filp->private_data = tty; - file_move(filp, 
&tty->tty_files); + + file_sb_list_del(filp); /* __dentry_open has put it on the sb list */ + spin_lock(&tty_files_lock); + list_add(&filp->f_u.fu_list, &tty->tty_files); + spin_unlock(&tty_files_lock); retval = devpts_pty_new(inode, tty->link); if (retval) Index: linux-2.6/drivers/char/tty_io.c =================================================================== --- linux-2.6.orig/drivers/char/tty_io.c +++ linux-2.6/drivers/char/tty_io.c @@ -136,6 +136,9 @@ LIST_HEAD(tty_drivers); /* linked list DEFINE_MUTEX(tty_mutex); EXPORT_SYMBOL(tty_mutex); +/* Spinlock to protect the tty->tty_files list */ +DEFINE_SPINLOCK(tty_files_lock); + static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *); static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *); ssize_t redirected_tty_write(struct file *, const char __user *, @@ -235,11 +238,11 @@ static int check_tty_count(struct tty_st struct list_head *p; int count = 0; - file_list_lock(); + spin_lock(&tty_files_lock); list_for_each(p, &tty->tty_files) { count++; } - file_list_unlock(); + spin_unlock(&tty_files_lock); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) @@ -517,7 +520,7 @@ static void do_tty_hangup(struct work_st spin_unlock(&redirect_lock); check_tty_count(tty, "do_tty_hangup"); - file_list_lock(); + spin_lock(&tty_files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? 
*/ list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) { if (filp->f_op->write == redirected_tty_write) @@ -528,7 +531,7 @@ static void do_tty_hangup(struct work_st tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } - file_list_unlock(); + spin_unlock(&tty_files_lock); tty_ldisc_hangup(tty); @@ -1404,9 +1407,9 @@ static void release_one_tty(struct work_ tty_driver_kref_put(driver); module_put(driver->owner); - file_list_lock(); + spin_lock(&tty_files_lock); list_del_init(&tty->tty_files); - file_list_unlock(); + spin_unlock(&tty_files_lock); free_tty_struct(tty); } @@ -1630,7 +1633,10 @@ void tty_release_dev(struct file *filp) * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ - file_kill(filp); + spin_lock(&tty_files_lock); + BUG_ON(list_empty(&filp->f_u.fu_list)); + list_del_init(&filp->f_u.fu_list); + spin_unlock(&tty_files_lock); filp->private_data = NULL; /* @@ -1788,7 +1794,11 @@ got_driver: return PTR_ERR(tty); filp->private_data = tty; - file_move(filp, &tty->tty_files); + BUG_ON(list_empty(&filp->f_u.fu_list)); + file_sb_list_del(filp); /* __dentry_open has put it on the sb list */ + spin_lock(&tty_files_lock); + list_add(&filp->f_u.fu_list, &tty->tty_files); + spin_unlock(&tty_files_lock); check_tty_count(tty, "tty_open"); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) Index: linux-2.6/fs/file_table.c =================================================================== --- linux-2.6.orig/fs/file_table.c +++ linux-2.6/fs/file_table.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -30,8 +31,7 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -/* public. Not pretty! 
*/ -__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +static DEFINE_PER_CPU(spinlock_t, files_cpulock); /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __read_mostly; @@ -285,7 +285,7 @@ void __fput(struct file *file) cdev_put(inode->i_cdev); fops_put(file->f_op); put_pid(file->f_owner.pid); - file_kill(file); + file_sb_list_del(file); if (file->f_mode & FMODE_WRITE) drop_file_write_access(file); file->f_path.dentry = NULL; @@ -347,55 +347,112 @@ struct file *fget_light(unsigned int fd, return file; } - void put_filp(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { security_file_free(file); - file_kill(file); + file_sb_list_del(file); file_free(file); } } -void file_move(struct file *file, struct list_head *list) +void file_sb_list_add(struct file *file, struct super_block *sb) { - if (!list) - return; - file_list_lock(); - list_move(&file->f_u.fu_list, list); - file_list_unlock(); + spinlock_t *lock; + struct list_head *list; +#ifdef CONFIG_SMP + int cpu; +#endif + + lock = &get_cpu_var(files_cpulock); +#ifdef CONFIG_SMP + cpu = smp_processor_id(); + list = per_cpu_ptr(sb->s_files, cpu); + file->f_sb_list_cpu = cpu; +#else + list = &sb->s_files; +#endif + spin_lock(lock); + BUG_ON(!list_empty(&file->f_u.fu_list)); + list_add(&file->f_u.fu_list, list); + spin_unlock(lock); + put_cpu_var(files_cpulock); } -void file_kill(struct file *file) +void file_sb_list_del(struct file *file) { if (!list_empty(&file->f_u.fu_list)) { - file_list_lock(); + spinlock_t *lock; + +#ifdef CONFIG_SMP + lock = &per_cpu(files_cpulock, file->f_sb_list_cpu); +#else + lock = &__get_cpu_var(files_cpulock); +#endif + spin_lock(lock); list_del_init(&file->f_u.fu_list); - file_list_unlock(); + spin_unlock(lock); + } +} + +static void file_list_lock_all(void) +{ + int i; + int nr = 0; + + for_each_possible_cpu(i) { + spinlock_t *lock; + + lock = &per_cpu(files_cpulock, i); + spin_lock_nested(lock, nr); + nr++; + } +} + +static void 
file_list_unlock_all(void) +{ + int i; + + for_each_possible_cpu(i) { + spinlock_t *lock; + + lock = &per_cpu(files_cpulock, i); + spin_unlock(lock); } } int fs_may_remount_ro(struct super_block *sb) { - struct file *file; + int i; /* Check that no files are currently opened for writing. */ - file_list_lock(); - list_for_each_entry(file, &sb->s_files, f_u.fu_list) { - struct inode *inode = file->f_path.dentry->d_inode; - - /* File with pending delete? */ - if (inode->i_nlink == 0) - goto too_bad; - - /* Writeable file? */ - if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) - goto too_bad; + file_list_lock_all(); + for_each_possible_cpu(i) { + struct file *file; + struct list_head *list; + +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_files, i); +#else + list = &sb->s_files; +#endif + list_for_each_entry(file, list, f_u.fu_list) { + struct inode *inode = file->f_path.dentry->d_inode; + + /* File with pending delete? */ + if (inode->i_nlink == 0) + goto too_bad; + + /* Writeable file? */ + if (S_ISREG(inode->i_mode) && + (file->f_mode & FMODE_WRITE)) + goto too_bad; + } } - file_list_unlock(); + file_list_unlock_all(); return 1; /* Tis' cool bro. */ too_bad: - file_list_unlock(); + file_list_unlock_all(); return 0; } @@ -408,38 +465,46 @@ too_bad: */ void mark_files_ro(struct super_block *sb) { - struct file *f; + int i; retry: - file_list_lock(); - list_for_each_entry(f, &sb->s_files, f_u.fu_list) { - struct vfsmount *mnt; - if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) - continue; - if (!file_count(f)) - continue; - if (!(f->f_mode & FMODE_WRITE)) - continue; - f->f_mode &= ~FMODE_WRITE; - if (file_check_writeable(f) != 0) - continue; - file_release_write(f); - mnt = mntget(f->f_path.mnt); - file_list_unlock(); - /* - * This can sleep, so we can't hold - * the file_list_lock() spinlock. 
- */ - mnt_drop_write(mnt); - mntput(mnt); - goto retry; + file_list_lock_all(); + for_each_possible_cpu(i) { + struct file *f; + struct list_head *list; + +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_files, i); +#else + list = &sb->s_files; +#endif + list_for_each_entry(f, list, f_u.fu_list) { + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + f->f_mode &= ~FMODE_WRITE; + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + /* This can sleep, so we can't hold the spinlock. */ + file_list_unlock_all(); + mnt_drop_write(mnt); + mntput(mnt); + goto retry; + } } - file_list_unlock(); + file_list_unlock_all(); } void __init files_init(unsigned long mempages) { int n; + int i; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); @@ -454,5 +519,7 @@ void __init files_init(unsigned long mem if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(files_cpulock, i)); percpu_counter_init(&nr_files, 0); } Index: linux-2.6/fs/open.c =================================================================== --- linux-2.6.orig/fs/open.c +++ linux-2.6/fs/open.c @@ -829,7 +829,7 @@ static struct file *__dentry_open(struct f->f_path.mnt = mnt; f->f_pos = 0; f->f_op = fops_get(inode->i_fop); - file_move(f, &inode->i_sb->s_files); + file_sb_list_add(f, inode->i_sb); error = security_dentry_open(f, cred); if (error) @@ -874,7 +874,7 @@ cleanup_all: mnt_drop_write(mnt); } } - file_kill(f); + file_sb_list_del(f); f->f_path.dentry = NULL; f->f_path.mnt = NULL; cleanup_file: Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h +++ linux-2.6/include/linux/fs.h @@ -8,6 +8,7 @@ #include #include +#include /* * 
It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -401,6 +402,8 @@ extern struct files_stat_struct files_st extern int get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; +extern struct percpu_counter nr_inodes; +extern int get_nr_inodes(void); extern int leases_enable, lease_break_time; #ifdef CONFIG_DNOTIFY extern int dir_notify_enable; @@ -720,9 +723,15 @@ struct inode { struct hlist_node i_hash; struct list_head i_list; /* backing dev IO list */ struct list_head i_sb_list; - struct list_head i_dentry; + union { + struct list_head i_dentry; + struct rcu_head i_rcu; + }; unsigned long i_ino; - atomic_t i_count; +#ifdef CONFIG_SMP + int i_sb_list_cpu; +#endif + unsigned int i_count; unsigned int i_nlink; uid_t i_uid; gid_t i_gid; @@ -919,6 +928,9 @@ struct file { #define f_vfsmnt f_path.mnt const struct file_operations *f_op; spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ +#ifdef CONFIG_SMP + int f_sb_list_cpu; +#endif atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -943,9 +955,6 @@ struct file { unsigned long f_mnt_write_state; #endif }; -extern spinlock_t files_lock; -#define file_list_lock() spin_lock(&files_lock); -#define file_list_unlock() spin_unlock(&files_lock); #define get_file(x) atomic_long_inc(&(x)->f_count) #define file_count(x) atomic_long_read(&(x)->f_count) @@ -1338,9 +1347,17 @@ struct super_block { #endif struct xattr_handler **s_xattr; - struct list_head s_inodes; /* all inodes */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ +#ifdef CONFIG_SMP + struct list_head *s_inodes; +#else + struct list_head s_inodes; /* all inodes */ +#endif +#ifdef CONFIG_SMP + struct list_head *s_files; +#else struct list_head s_files; +#endif /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ @@ -2042,6 +2059,7 @@ extern const struct 
file_operations read extern const struct file_operations write_pipefifo_fops; extern const struct file_operations rdwr_pipefifo_fops; +extern void mark_files_ro(struct super_block *sb); extern int fs_may_remount_ro(struct super_block *); #ifdef CONFIG_BLOCK @@ -2175,7 +2193,6 @@ extern int insert_inode_locked4(struct i extern int insert_inode_locked(struct inode *); extern void unlock_new_inode(struct inode *); -extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); @@ -2184,15 +2201,17 @@ extern struct inode *new_inode(struct su extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); +extern void inode_sb_list_del(struct inode *inode); extern void __insert_inode_hash(struct inode *, unsigned long hashval); +extern void __remove_inode_hash(struct inode *); extern void remove_inode_hash(struct inode *); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } extern struct file * get_empty_filp(void); -extern void file_move(struct file *f, struct list_head *list); -extern void file_kill(struct file *f); +extern void file_sb_list_add(struct file *f, struct super_block *sb); +extern void file_sb_list_del(struct file *f); #ifdef CONFIG_BLOCK struct bio; extern void submit_bio(int, struct bio *); @@ -2397,10 +2416,20 @@ extern int generic_show_options(struct s extern void save_mount_options(struct super_block *sb, char *options); extern void replace_mount_options(struct super_block *sb, char *options); +static inline void __iget(struct inode *inode) +{ + assert_spin_locked(&inode->i_lock); + inode->i_count++; +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; + /* + * Don't strictly need d_lock here? If the parent ino could change + * then surely we'd have a deeper race in the caller? 
+ */ spin_lock(&dentry->d_lock); res = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); @@ -2476,7 +2505,8 @@ ssize_t simple_attr_write(struct file *f struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); - +int proc_nr_inodes(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #endif /* __KERNEL__ */ Index: linux-2.6/security/selinux/hooks.c =================================================================== --- linux-2.6.orig/security/selinux/hooks.c +++ linux-2.6/security/selinux/hooks.c @@ -2247,7 +2247,7 @@ static inline void flush_unauthorized_fi tty = get_current_tty(); if (tty) { - file_list_lock(); + spin_lock(&tty_files_lock); if (!list_empty(&tty->tty_files)) { struct inode *inode; @@ -2263,7 +2263,7 @@ static inline void flush_unauthorized_fi drop_tty = 1; } } - file_list_unlock(); + spin_unlock(&tty_files_lock); tty_kref_put(tty); } /* Reset controlling tty. 
*/ Index: linux-2.6/include/linux/tty.h =================================================================== --- linux-2.6.orig/include/linux/tty.h +++ linux-2.6/include/linux/tty.h @@ -446,6 +446,7 @@ extern struct tty_struct *tty_pair_get_t extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty); extern struct mutex tty_mutex; +extern spinlock_t tty_files_lock; extern void tty_write_unlock(struct tty_struct *tty); extern int tty_write_lock(struct tty_struct *tty, int ndelay); Index: linux-2.6/fs/dcache.c =================================================================== --- linux-2.6.orig/fs/dcache.c +++ linux-2.6/fs/dcache.c @@ -35,13 +35,34 @@ #include #include "internal.h" +/* + * Usage: + * dcache->d_inode->i_lock protects: + * - the inode alias lists, d_inode + * dcache_hash_bucket->lock protects: + * - the dcache hash table + * dcache_lru_lock protects: + * - the dcache lru lists and counters + * d_lock protects: + * - d_flags + * - d_name + * - d_lru + * - d_unhashed + * - d_subdirs and children's d_child + * + * Ordering: + * dcache->d_inode->i_lock + * dentry->d_lock + * dcache_lru_lock + * dcache_hash_bucket->lock + */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); -EXPORT_SYMBOL(dcache_lock); +EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; @@ -60,13 +81,27 @@ static struct kmem_cache *dentry_cache _ static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; -static struct hlist_head *dentry_hashtable __read_mostly; + +struct dcache_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; +static struct dcache_hash_bucket *dentry_hashtable __read_mostly; /* Statistics gathering. 
*/ struct dentry_stat_t dentry_stat = { + .nr_dentry = 0, .age_limit = 45, }; +static inline struct dcache_hash_bucket *d_hash(struct dentry *parent, + unsigned long hash) +{ + hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; + hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); + return dentry_hashtable + (hash & D_HASHMASK); +} + static void __d_free(struct dentry *dentry) { WARN_ON(!list_empty(&dentry->d_alias)); @@ -82,11 +117,11 @@ static void d_callback(struct rcu_head * } /* - * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry - * inside dcache_lock. + * no locks, please. */ static void d_free(struct dentry *dentry) { + BUG_ON(dentry->d_count); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); /* if dentry was never inserted into hash, immediate free is OK */ @@ -102,14 +137,13 @@ static void d_free(struct dentry *dentry */ static void dentry_iput(struct dentry * dentry) __releases(dentry->d_lock) - __releases(dcache_lock) { struct inode *inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; list_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) @@ -118,42 +152,60 @@ static void dentry_iput(struct dentry * iput(inode); } else { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } } /* - * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held. + * dentry_lru_(add|add_tail|del|del_init) must be called with d_lock held + * to protect list_empty(d_lru) condition. 
*/ static void dentry_lru_add(struct dentry *dentry) { + spin_lock(&dcache_lru_lock); list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; + spin_unlock(&dcache_lru_lock); } static void dentry_lru_add_tail(struct dentry *dentry) { + spin_lock(&dcache_lru_lock); list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); dentry->d_sb->s_nr_dentry_unused++; dentry_stat.nr_unused++; + spin_unlock(&dcache_lru_lock); +} + +static void __dentry_lru_del(struct dentry *dentry) +{ + list_del(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; +} + +static void __dentry_lru_del_init(struct dentry *dentry) +{ + list_del_init(&dentry->d_lru); + dentry->d_sb->s_nr_dentry_unused--; + dentry_stat.nr_unused--; } static void dentry_lru_del(struct dentry *dentry) { if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + spin_lock(&dcache_lru_lock); + __dentry_lru_del(dentry); + spin_unlock(&dcache_lru_lock); } } static void dentry_lru_del_init(struct dentry *dentry) { if (likely(!list_empty(&dentry->d_lru))) { - list_del_init(&dentry->d_lru); - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + spin_lock(&dcache_lru_lock); + __dentry_lru_del_init(dentry); + spin_unlock(&dcache_lru_lock); } } @@ -164,25 +216,89 @@ static void dentry_lru_del_init(struct d * The dentry must already be unhashed and removed from the LRU. * * If this is the root of the dentry tree, return NULL. + * + * d_lock and d_parent->d_lock must be held by caller, and + * are dropped by d_kill. 
*/ static struct dentry *d_kill(struct dentry *dentry) __releases(dentry->d_lock) - __releases(dcache_lock) { struct dentry *parent; list_del(&dentry->d_u.d_child); - dentry_stat.nr_dentry--; /* For d_free, below */ - /*drops the locks, at that point nobody can reach this dentry */ - dentry_iput(dentry); + if (dentry->d_parent && dentry != dentry->d_parent) + spin_unlock(&dentry->d_parent->d_lock); if (IS_ROOT(dentry)) parent = NULL; else parent = dentry->d_parent; + /*drops the locks, at that point nobody can reach this dentry */ + dentry_iput(dentry); d_free(dentry); return parent; } +void __d_drop(struct dentry *dentry) +{ + if (!(dentry->d_flags & DCACHE_UNHASHED)) { + struct dcache_hash_bucket *b; + b = d_hash(dentry->d_parent, dentry->d_name.hash); + dentry->d_flags |= DCACHE_UNHASHED; + spin_lock(&b->lock); + hlist_del_rcu(&dentry->d_hash); + spin_unlock(&b->lock); + } +} +EXPORT_SYMBOL(__d_drop); + +void d_drop(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_drop); + +static inline struct dentry *__dget_dlock(struct dentry *dentry) +{ + dentry->d_count++; + return dentry; +} + +static inline struct dentry *__dget(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + return dentry; +} + +struct dentry *dget_parent(struct dentry *dentry) +{ + struct dentry *ret; + +repeat: + spin_lock(&dentry->d_lock); + ret = dentry->d_parent; + if (!ret) + goto out; + if (dentry == ret) { + ret->d_count++; + goto out; + } + if (!spin_trylock(&ret->d_lock)) { + spin_unlock(&dentry->d_lock); + goto repeat; + } + BUG_ON(!ret->d_count); + ret->d_count++; + spin_unlock(&ret->d_lock); +out: + spin_unlock(&dentry->d_lock); + return ret; +} +EXPORT_SYMBOL(dget_parent); + /* * This is dput * @@ -214,19 +330,20 @@ static struct dentry *d_kill(struct dent void dput(struct dentry *dentry) { + struct dentry *parent; + struct inode *inode; + if 
(!dentry) return; repeat: - if (atomic_read(&dentry->d_count) == 1) + if (dentry->d_count == 1) might_sleep(); - if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) - return; - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count)) { + BUG_ON(!dentry->d_count); + if (dentry->d_count > 1) { + dentry->d_count--; spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return; } @@ -234,8 +351,10 @@ repeat: * AV: ->d_delete() is _NOT_ allowed to block now. */ if (dentry->d_op && dentry->d_op->d_delete) { - if (dentry->d_op->d_delete(dentry)) - goto unhash_it; + if (dentry->d_op->d_delete(dentry)) { + __d_drop(dentry); + goto kill_it; + } } /* Unreachable? Get rid of it */ if (d_unhashed(dentry)) @@ -244,13 +363,39 @@ repeat: dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); } - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - return; + dentry->d_count--; + spin_unlock(&dentry->d_lock); + return; -unhash_it: - __d_drop(dentry); +relock1: + spin_lock(&dentry->d_lock); kill_it: + inode = dentry->d_inode; + if (inode) { + if (!spin_trylock(&inode->i_lock)) { +relock2: + spin_unlock(&dentry->d_lock); + goto relock1; + } + } + parent = dentry->d_parent; + if (parent && parent != dentry) { + if (!spin_trylock(&parent->d_lock)) { + if (inode) + spin_unlock(&inode->i_lock); + goto relock2; + } + } + dentry->d_count--; + if (dentry->d_count) { + /* This case should be fine */ + spin_unlock(&dentry->d_lock); + if (parent && parent != dentry) + spin_unlock(&parent->d_lock); + if (inode) + spin_unlock(&inode->i_lock); + return; + } /* if dentry was on the d_lru list delete it from there */ dentry_lru_del(dentry); dentry = d_kill(dentry); @@ -275,9 +420,9 @@ int d_invalidate(struct dentry * dentry) /* * If it's already been dropped, return OK. 
*/ - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); return 0; } /* @@ -285,9 +430,9 @@ int d_invalidate(struct dentry * dentry) * to get rid of unused child entries. */ if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); } /* @@ -300,35 +445,18 @@ int d_invalidate(struct dentry * dentry) * we might still populate it if it was a * working directory or similar). */ - spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1) { + if (dentry->d_count > 1) { if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return -EBUSY; } } __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 0; } -/* This should be called _only_ with dcache_lock held */ - -static inline struct dentry * __dget_locked(struct dentry *dentry) -{ - atomic_inc(&dentry->d_count); - dentry_lru_del_init(dentry); - return dentry; -} - -struct dentry * dget_locked(struct dentry *dentry) -{ - return __dget_locked(dentry); -} - /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question @@ -358,18 +486,21 @@ static struct dentry * __d_find_alias(st next = tmp->next; prefetch(next); alias = list_entry(tmp, struct dentry, d_alias); + spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) discon_alias = alias; else if (!want_discon) { - __dget_locked(alias); + __dget_dlock(alias); + spin_unlock(&alias->d_lock); return alias; } } + spin_unlock(&alias->d_lock); } if (discon_alias) - __dget_locked(discon_alias); + __dget(discon_alias); return discon_alias; } @@ -378,9 +509,9 @@ struct dentry * d_find_alias(struct inod struct dentry *de = NULL; if (!list_empty(&inode->i_dentry)) { - 
spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); de = __d_find_alias(inode, 0); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } return de; } @@ -393,20 +524,20 @@ void d_prune_aliases(struct inode *inode { struct dentry *dentry; restart: - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); - if (!atomic_read(&dentry->d_count)) { - __dget_locked(dentry); + if (!dentry->d_count) { + __dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); dput(dentry); goto restart; } spin_unlock(&dentry->d_lock); } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } /* @@ -419,27 +550,43 @@ restart: */ static void prune_one_dentry(struct dentry * dentry) __releases(dentry->d_lock) - __releases(dcache_lock) - __acquires(dcache_lock) { __d_drop(dentry); dentry = d_kill(dentry); /* - * Prune ancestors. Locking is simpler than in dput(), - * because dcache_lock needs to be taken anyway. + * Prune ancestors. 
*/ - spin_lock(&dcache_lock); while (dentry) { - if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock)) + struct dentry *parent = NULL; + struct inode *inode = dentry->d_inode; + + if (inode) + spin_lock(&inode->i_lock); +again: + spin_lock(&dentry->d_lock); + if (dentry->d_parent && dentry != dentry->d_parent) { + if (!spin_trylock(&dentry->d_parent->d_lock)) { + spin_unlock(&dentry->d_lock); + goto again; + } + parent = dentry->d_parent; + } + dentry->d_count--; + if (dentry->d_count) { + if (parent) + spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + if (inode) + spin_unlock(&inode->i_lock); return; + } if (dentry->d_op && dentry->d_op->d_delete) dentry->d_op->d_delete(dentry); dentry_lru_del_init(dentry); __d_drop(dentry); dentry = d_kill(dentry); - spin_lock(&dcache_lock); } } @@ -460,10 +607,11 @@ static void __shrink_dcache_sb(struct su BUG_ON(!sb); BUG_ON((flags & DCACHE_REFERENCED) && count == NULL); - spin_lock(&dcache_lock); if (count != NULL) /* called from prune_dcache() and shrink_dcache_parent() */ cnt = *count; +relock: + spin_lock(&dcache_lru_lock); restart: if (count == NULL) list_splice_init(&sb->s_dentry_lru, &tmp); @@ -473,7 +621,10 @@ restart: struct dentry, d_lru); BUG_ON(dentry->d_sb != sb); - spin_lock(&dentry->d_lock); + if (!spin_trylock(&dentry->d_lock)) { + spin_unlock(&dcache_lru_lock); + goto relock; + } /* * If we are honouring the DCACHE_REFERENCED flag and * the dentry has this flag set, don't free it. 
Clear @@ -491,33 +642,61 @@ restart: if (!cnt) break; } - cond_resched_lock(&dcache_lock); + cond_resched_lock(&dcache_lru_lock); } } + spin_unlock(&dcache_lru_lock); + +again: + spin_lock(&dcache_lru_lock); /* lru_lock also protects tmp list */ while (!list_empty(&tmp)) { + struct inode *inode; + dentry = list_entry(tmp.prev, struct dentry, d_lru); - dentry_lru_del_init(dentry); - spin_lock(&dentry->d_lock); + + if (!spin_trylock(&dentry->d_lock)) { +again1: + spin_unlock(&dcache_lru_lock); + goto again; + } /* * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free * it - just keep it off the LRU list. */ - if (atomic_read(&dentry->d_count)) { + if (dentry->d_count) { + __dentry_lru_del_init(dentry); spin_unlock(&dentry->d_lock); continue; } + inode = dentry->d_inode; + if (inode && !spin_trylock(&inode->i_lock)) { +again2: + spin_unlock(&dentry->d_lock); + goto again1; + } + if (dentry->d_parent && dentry->d_parent != dentry) { + if (!spin_trylock(&dentry->d_parent->d_lock)) { + if (inode) + spin_unlock(&inode->i_lock); + goto again2; + } + } + __dentry_lru_del_init(dentry); + spin_unlock(&dcache_lru_lock); + prune_one_dentry(dentry); - /* dentry->d_lock was dropped in prune_one_dentry() */ - cond_resched_lock(&dcache_lock); + /* dentry->d_lock dropped */ + spin_lock(&dcache_lru_lock); } + if (count == NULL && !list_empty(&sb->s_dentry_lru)) goto restart; if (count != NULL) *count = cnt; if (!list_empty(&referenced)) list_splice(&referenced, &sb->s_dentry_lru); - spin_unlock(&dcache_lock); + spin_unlock(&dcache_lru_lock); } /** @@ -539,7 +718,6 @@ static void prune_dcache(int count) if (unused == 0 || count == 0) return; - spin_lock(&dcache_lock); restart: if (count >= unused) prune_ratio = 1; @@ -575,11 +753,9 @@ restart: if (down_read_trylock(&sb->s_umount)) { if ((sb->s_root != NULL) && (!list_empty(&sb->s_dentry_lru))) { - spin_unlock(&dcache_lock); __shrink_dcache_sb(sb, &w_count, DCACHE_REFERENCED); 
pruned -= w_count; - spin_lock(&dcache_lock); } up_read(&sb->s_umount); } @@ -595,7 +771,6 @@ restart: } } spin_unlock(&sb_lock); - spin_unlock(&dcache_lock); } /** @@ -624,10 +799,10 @@ static void shrink_dcache_for_umount_sub BUG_ON(!IS_ROOT(dentry)); /* detach this root from the system */ - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); dentry_lru_del_init(dentry); __d_drop(dentry); - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); for (;;) { /* descend to the first leaf in the current subtree */ @@ -636,14 +811,15 @@ static void shrink_dcache_for_umount_sub /* this is a branch with children - detach all of them * from the system in one go */ - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); list_for_each_entry(loop, &dentry->d_subdirs, d_u.d_child) { + spin_lock_nested(&loop->d_lock, DENTRY_D_LOCK_NESTED); dentry_lru_del_init(loop); __d_drop(loop); - cond_resched_lock(&dcache_lock); + spin_unlock(&loop->d_lock); } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); /* move to the first child */ dentry = list_entry(dentry->d_subdirs.next, @@ -655,7 +831,7 @@ static void shrink_dcache_for_umount_sub do { struct inode *inode; - if (atomic_read(&dentry->d_count) != 0) { + if (dentry->d_count != 0) { printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%s}" " still in use (%d)" @@ -664,20 +840,23 @@ static void shrink_dcache_for_umount_sub dentry->d_inode ? 
dentry->d_inode->i_ino : 0UL, dentry->d_name.name, - atomic_read(&dentry->d_count), + dentry->d_count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); BUG(); } - if (IS_ROOT(dentry)) + if (IS_ROOT(dentry)) { parent = NULL; - else { + list_del(&dentry->d_u.d_child); + } else { parent = dentry->d_parent; - atomic_dec(&parent->d_count); + spin_lock(&parent->d_lock); + parent->d_count--; + list_del(&dentry->d_u.d_child); + spin_unlock(&parent->d_lock); } - list_del(&dentry->d_u.d_child); detached++; inode = dentry->d_inode; @@ -706,16 +885,12 @@ static void shrink_dcache_for_umount_sub struct dentry, d_u.d_child); } out: - /* several dentries were freed, need to correct nr_dentry */ - spin_lock(&dcache_lock); - dentry_stat.nr_dentry -= detached; - spin_unlock(&dcache_lock); + return; } /* * destroy the dentries attached to a superblock on unmounting - * - we don't need to use dentry->d_lock, and only need dcache_lock when - * removing the dentry from the system lists and hashes because: + * - we don't need to use dentry->d_lock because: * - the superblock is detached from all mountings and open files, so the * dentry trees will not be rearranged by the VFS * - s_umount is write-locked, so the memory pressure shrinker will ignore @@ -732,7 +907,9 @@ void shrink_dcache_for_umount(struct sup dentry = sb->s_root; sb->s_root = NULL; - atomic_dec(&dentry->d_count); + spin_lock(&dentry->d_lock); + dentry->d_count--; + spin_unlock(&dentry->d_lock); shrink_dcache_for_umount_subtree(dentry); while (!hlist_empty(&sb->s_anon)) { @@ -754,15 +931,19 @@ void shrink_dcache_for_umount(struct sup * Return true if the parent or its subdirectories contain * a mount point */ - int have_submounts(struct dentry *parent) { - struct dentry *this_parent = parent; + struct dentry *this_parent; struct list_head *next; + unsigned seq; + +rename_retry: + this_parent = parent; + seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); if (d_mountpoint(parent)) goto positive; + 
spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -770,26 +951,56 @@ resume: struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); /* Have we found a mount point ? */ - if (d_mountpoint(dentry)) + if (d_mountpoint(dentry)) { + spin_unlock(&dentry->d_lock); + spin_unlock(&this_parent->d_lock); goto positive; + } if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } + spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_u.d_child.next; - this_parent = this_parent->d_parent; + struct dentry *tmp; + struct dentry *child; + + tmp = this_parent->d_parent; + rcu_read_lock(); + spin_unlock(&this_parent->d_lock); + child = this_parent; + this_parent = tmp; + spin_lock(&this_parent->d_lock); + /* might go back up the wrong parent if we have had a rename + * or deletion */ + if (this_parent != child->d_parent || + // d_unlinked(this_parent) || XXX + read_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + rcu_read_unlock(); + next = child->d_u.d_child.next; goto resume; } - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return 0; /* No mount points found in tree */ positive: - spin_unlock(&dcache_lock); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return 1; } @@ -809,11 +1020,17 @@ positive: */ static int select_parent(struct dentry * parent) { - struct dentry *this_parent = parent; + struct dentry *this_parent; struct list_head *next; - int found = 0; + unsigned seq; + int found; + 
+rename_retry: + found = 0; + this_parent = parent; + seq = read_seqbegin(&rename_lock); - spin_lock(&dcache_lock); + spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -822,12 +1039,13 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_lru_del_init(dentry); /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ - if (!atomic_read(&dentry->d_count)) { + if (!dentry->d_count) { dentry_lru_add_tail(dentry); found++; } @@ -837,27 +1055,54 @@ resume: * ensures forward progress). We'll be coming back to find * the rest. */ - if (found && need_resched()) + if (found && need_resched()) { + spin_unlock(&dentry->d_lock); goto out; + } /* * Descend a level if the d_subdirs list is non-empty. */ if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } + + spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. 
*/ if (this_parent != parent) { - next = this_parent->d_u.d_child.next; - this_parent = this_parent->d_parent; + struct dentry *tmp; + struct dentry *child; + + tmp = this_parent->d_parent; + rcu_read_lock(); + spin_unlock(&this_parent->d_lock); + child = this_parent; + this_parent = tmp; + spin_lock(&this_parent->d_lock); + /* might go back up the wrong parent if we have had a rename + * or deletion */ + if (this_parent != child->d_parent || + // d_unlinked(this_parent) || XXX + read_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + rcu_read_unlock(); + next = child->d_u.d_child.next; goto resume; } out: - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return found; } @@ -939,7 +1184,7 @@ struct dentry *d_alloc(struct dentry * p memcpy(dname, name->name, name->len); dname[name->len] = 0; - atomic_set(&dentry->d_count, 1); + dentry->d_count = 1; dentry->d_flags = DCACHE_UNHASHED; spin_lock_init(&dentry->d_lock); dentry->d_inode = NULL; @@ -952,19 +1197,17 @@ struct dentry *d_alloc(struct dentry * p INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); INIT_LIST_HEAD(&dentry->d_alias); + INIT_LIST_HEAD(&dentry->d_u.d_child); if (parent) { - dentry->d_parent = dget(parent); + spin_lock(&parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry->d_parent = dget_dlock(parent); dentry->d_sb = parent->d_sb; - } else { - INIT_LIST_HEAD(&dentry->d_u.d_child); - } - - spin_lock(&dcache_lock); - if (parent) list_add(&dentry->d_u.d_child, &parent->d_subdirs); - dentry_stat.nr_dentry++; - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); + } return dentry; } @@ -979,7 +1222,6 @@ struct dentry *d_alloc_name(struct dentr return d_alloc(parent, &q); } -/* the caller must hold dcache_lock */ static void __d_instantiate(struct dentry *dentry, struct inode 
*inode) { if (inode) @@ -1006,9 +1248,11 @@ static void __d_instantiate(struct dentr void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); + if (inode) + spin_lock(&inode->i_lock); __d_instantiate(entry, inode); - spin_unlock(&dcache_lock); + if (inode) + spin_unlock(&inode->i_lock); security_d_instantiate(entry, inode); } @@ -1052,7 +1296,7 @@ static struct dentry *__d_instantiate_un continue; if (memcmp(qstr->name, name, len)) continue; - dget_locked(alias); + dget(alias); return alias; } @@ -1066,9 +1310,11 @@ struct dentry *d_instantiate_unique(stru BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); + if (inode) + spin_lock(&inode->i_lock); result = __d_instantiate_unique(entry, inode); - spin_unlock(&dcache_lock); + if (inode) + spin_unlock(&inode->i_lock); if (!result) { security_d_instantiate(entry, inode); @@ -1108,14 +1354,6 @@ struct dentry * d_alloc_root(struct inod return res; } -static inline struct hlist_head *d_hash(struct dentry *parent, - unsigned long hash) -{ - hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; - hash = hash ^ ((hash ^ GOLDEN_RATIO_PRIME) >> D_HASHBITS); - return dentry_hashtable + (hash & D_HASHMASK); -} - /** * d_obtain_alias - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for @@ -1156,10 +1394,10 @@ struct dentry *d_obtain_alias(struct ino } tmp->d_parent = tmp; /* make sure dput doesn't croak */ - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); res = __d_find_alias(inode, 0); if (res) { - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); dput(tmp); goto out_iput; } @@ -1173,8 +1411,8 @@ struct dentry *d_obtain_alias(struct ino list_add(&tmp->d_alias, &inode->i_dentry); hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon); spin_unlock(&tmp->d_lock); + spin_unlock(&inode->i_lock); - spin_unlock(&dcache_lock); return tmp; out_iput: @@ -1204,19 +1442,19 @@ struct dentry 
*d_splice_alias(struct ino struct dentry *new = NULL; if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); new = __d_find_alias(inode, 1); if (new) { BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); d_rehash(dentry); d_move(new, dentry); iput(inode); } else { - /* already taking dcache_lock, so d_add() by hand */ + /* already taken inode->i_lock, d_add() by hand */ __d_instantiate(dentry, inode); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); security_d_instantiate(dentry, inode); d_rehash(dentry); } @@ -1288,10 +1526,10 @@ struct dentry *d_add_ci(struct dentry *d * Negative dentry: instantiate it unless the inode is a directory and * already has a dentry. */ - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { __d_instantiate(found, inode); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); security_d_instantiate(found, inode); return found; } @@ -1301,8 +1539,8 @@ struct dentry *d_add_ci(struct dentry *d * reference to it, move it in place and use it. */ new = list_entry(inode->i_dentry.next, struct dentry, d_alias); - dget_locked(new); - spin_unlock(&dcache_lock); + dget(new); + spin_unlock(&inode->i_lock); security_d_instantiate(found, inode); d_move(new, found); iput(inode); @@ -1324,7 +1562,7 @@ err_out: * is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned on failure. * - * __d_lookup is dcache_lock free. The hash list is protected using RCU. + * __d_lookup is global lock free. The hash list is protected using RCU. * Memory barriers are used while updating and doing lockless traversal. * To avoid races with d_move while rename is happening, d_lock is used. * @@ -1336,8 +1574,7 @@ err_out: * * The dentry unused LRU is not updated even if lookup finds the required dentry * in there. 
It is updated in places such as prune_dcache, shrink_dcache_sb, - * select_parent and __dget_locked. This laziness saves lookup from dcache_lock - * acquisition. + * select_parent. This laziness saves lookup from LRU lock acquisition. * * d_lookup() is protected against the concurrent renames in some unrelated * directory using the seqlockt_t rename_lock. @@ -1346,7 +1583,7 @@ err_out: struct dentry * d_lookup(struct dentry * parent, struct qstr * name) { struct dentry * dentry = NULL; - unsigned long seq; + unsigned seq; do { seq = read_seqbegin(&rename_lock); @@ -1362,7 +1599,8 @@ struct dentry * __d_lookup(struct dentry unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; - struct hlist_head *head = d_hash(parent,hash); + struct dcache_hash_bucket *b = d_hash(parent, hash); + struct hlist_head *head = &b->head; struct dentry *found = NULL; struct hlist_node *node; struct dentry *dentry; @@ -1406,7 +1644,7 @@ struct dentry * __d_lookup(struct dentry goto next; } - atomic_inc(&dentry->d_count); + dentry->d_count++; found = dentry; spin_unlock(&dentry->d_lock); break; @@ -1456,6 +1694,7 @@ out: int d_validate(struct dentry *dentry, struct dentry *dparent) { + struct dcache_hash_bucket *b; struct hlist_head *base; struct hlist_node *lhp; @@ -1466,19 +1705,23 @@ int d_validate(struct dentry *dentry, st if (dentry->d_parent != dparent) goto out; - spin_lock(&dcache_lock); - base = d_hash(dparent, dentry->d_name.hash); - hlist_for_each(lhp,base) { + spin_lock(&dentry->d_lock); + b = d_hash(dparent, dentry->d_name.hash); + base = &b->head; + spin_lock(&b->lock); + hlist_for_each(lhp, base) { /* hlist_for_each_entry_rcu() not required for d_hash list - * as it is parsed under dcache_lock + * as it is parsed under dcache_hash_bucket->lock */ if (dentry == hlist_entry(lhp, struct dentry, d_hash)) { - __dget_locked(dentry); - spin_unlock(&dcache_lock); + spin_unlock(&b->lock); + __dget_dlock(dentry); + 
spin_unlock(&dentry->d_lock); return 1; } } - spin_unlock(&dcache_lock); + spin_unlock(&b->lock); + spin_unlock(&dentry->d_lock); out: return 0; } @@ -1506,14 +1749,20 @@ out: void d_delete(struct dentry * dentry) { + struct inode *inode; int isdir = 0; /* * Are we the only user? */ - spin_lock(&dcache_lock); +again: spin_lock(&dentry->d_lock); - isdir = S_ISDIR(dentry->d_inode->i_mode); - if (atomic_read(&dentry->d_count) == 1) { + inode = dentry->d_inode; + isdir = S_ISDIR(inode->i_mode); + if (dentry->d_count == 1) { + if (inode && !spin_trylock(&inode->i_lock)) { + spin_unlock(&dentry->d_lock); + goto again; + } dentry_iput(dentry); fsnotify_nameremove(dentry, isdir); return; @@ -1523,16 +1772,16 @@ void d_delete(struct dentry * dentry) __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); fsnotify_nameremove(dentry, isdir); } -static void __d_rehash(struct dentry * entry, struct hlist_head *list) +static void __d_rehash(struct dentry * entry, struct dcache_hash_bucket *b) { - entry->d_flags &= ~DCACHE_UNHASHED; - hlist_add_head_rcu(&entry->d_hash, list); + spin_lock(&b->lock); + hlist_add_head_rcu(&entry->d_hash, &b->head); + spin_unlock(&b->lock); } static void _d_rehash(struct dentry * entry) @@ -1549,11 +1798,9 @@ static void _d_rehash(struct dentry * en void d_rehash(struct dentry * entry) { - spin_lock(&dcache_lock); spin_lock(&entry->d_lock); _d_rehash(entry); spin_unlock(&entry->d_lock); - spin_unlock(&dcache_lock); } /* @@ -1630,32 +1877,46 @@ static void switch_names(struct dentry * */ static void d_move_locked(struct dentry * dentry, struct dentry * target) { - struct hlist_head *list; - + struct dcache_hash_bucket *b; if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); write_seqlock(&rename_lock); - /* - * XXXX: do we really need to take target->d_lock? 
- */ - if (target < dentry) { - spin_lock(&target->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + if (target->d_parent != dentry->d_parent) { + if (target->d_parent < dentry->d_parent) { + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&dentry->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } else { + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } } else { - spin_lock(&dentry->d_lock); - spin_lock_nested(&target->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock(&target->d_parent->d_lock); } - /* Move the dentry to the target hash queue, if on different bucket */ - if (d_unhashed(dentry)) - goto already_unhashed; - - hlist_del_rcu(&dentry->d_hash); + if (dentry != dentry->d_parent) { + if (target < dentry) { + spin_lock_nested(&target->d_lock, 2); + spin_lock_nested(&dentry->d_lock, 3); + } else { + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); + } + } else { + spin_lock_nested(&target->d_lock, 2); + } -already_unhashed: - list = d_hash(target->d_parent, target->d_name.hash); - __d_rehash(dentry, list); + /* Move the dentry to the target hash queue, if on different bucket */ + if (!d_unhashed(dentry)) { + b = d_hash(dentry->d_parent, dentry->d_name.hash); + spin_lock(&b->lock); + hlist_del_rcu(&dentry->d_hash); + spin_unlock(&b->lock); + } + __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); /* Unhash the target: dput() will then get rid of it */ __d_drop(target); @@ -1680,6 +1941,10 @@ already_unhashed: } list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + if (target->d_parent != dentry->d_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (target->d_parent != target) + spin_unlock(&target->d_parent->d_lock); spin_unlock(&target->d_lock); fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); @@ -1697,9 +1962,7 @@ already_unhashed: void d_move(struct dentry * dentry, struct dentry * target) { - 
spin_lock(&dcache_lock); d_move_locked(dentry, target); - spin_unlock(&dcache_lock); } /** @@ -1725,16 +1988,16 @@ struct dentry *d_ancestor(struct dentry * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex and the dcache_lock + * dentry->d_parent->d_inode->i_mutex * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias) - __releases(dcache_lock) { struct mutex *m1 = NULL, *m2 = NULL; struct dentry *ret; + struct inode *inode; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) @@ -1750,14 +2013,15 @@ static struct dentry *__d_unalias(struct if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + inode = alias->d_parent->d_inode; + if (!mutex_trylock(&inode->i_mutex)) goto out_err; - m2 = &alias->d_parent->d_inode->i_mutex; + m2 = &inode->i_mutex; out_unalias: d_move_locked(alias, dentry); ret = alias; out_err: - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); if (m2) mutex_unlock(m2); if (m1) @@ -1779,6 +2043,12 @@ static void __d_materialise_dentry(struc dparent = dentry->d_parent; aparent = anon->d_parent; + /* XXX: hack */ + spin_lock(&aparent->d_lock); + spin_lock(&dparent->d_lock); + spin_lock(&dentry->d_lock); + spin_lock(&anon->d_lock); + dentry->d_parent = (aparent == anon) ? 
dentry : aparent; list_del(&dentry->d_u.d_child); if (!IS_ROOT(dentry)) @@ -1793,6 +2063,11 @@ static void __d_materialise_dentry(struc else INIT_LIST_HEAD(&anon->d_u.d_child); + spin_unlock(&anon->d_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&dparent->d_lock); + spin_unlock(&aparent->d_lock); + anon->d_flags &= ~DCACHE_DISCONNECTED; } @@ -1810,14 +2085,15 @@ struct dentry *d_materialise_unique(stru BUG_ON(!d_unhashed(dentry)); - spin_lock(&dcache_lock); - if (!inode) { actual = dentry; __d_instantiate(dentry, NULL); - goto found_lock; + d_rehash(actual); + goto out_nolock; } + spin_lock(&inode->i_lock); + if (S_ISDIR(inode->i_mode)) { struct dentry *alias; @@ -1845,15 +2121,14 @@ struct dentry *d_materialise_unique(stru actual = __d_instantiate_unique(dentry, inode); if (!actual) actual = dentry; - else if (unlikely(!d_unhashed(actual))) - goto shouldnt_be_hashed; + else + BUG_ON(!d_unhashed(actual)); -found_lock: spin_lock(&actual->d_lock); found: _d_rehash(actual); spin_unlock(&actual->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); out_nolock: if (actual == dentry) { security_d_instantiate(dentry, inode); @@ -1862,10 +2137,6 @@ out_nolock: iput(inode); return actual; - -shouldnt_be_hashed: - spin_unlock(&dcache_lock); - BUG(); } static int prepend(char **buffer, int *buflen, const char *str, int namelen) @@ -1896,7 +2167,7 @@ static int prepend_name(char **buffer, i * Returns a pointer into the buffer or an error code if the * path was too long. * - * "buflen" should be positive. Caller holds the dcache_lock. + * "buflen" should be positive. Caller holds the path->dentry->d_lock. * * If path is not reachable from the supplied root, then the value of * root is changed (without modifying refcounts). 
@@ -1904,13 +2175,22 @@ static int prepend_name(char **buffer, i char *__d_path(const struct path *path, struct path *root, char *buffer, int buflen) { - struct dentry *dentry = path->dentry; - struct vfsmount *vfsmnt = path->mnt; - char *end = buffer + buflen; + struct dentry *dentry; + struct vfsmount *vfsmnt; + char *end; char *retval; + unsigned seq; - spin_lock(&vfsmount_lock); +rename_retry: + dentry = path->dentry; + vfsmnt = path->mnt; + end = buffer + buflen; prepend(&end, &buflen, "\0", 1); + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + spin_lock(&dentry->d_lock); +unlinked: if (d_unlinked(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) goto Elong; @@ -1922,7 +2202,7 @@ char *__d_path(const struct path *path, *retval = '/'; for (;;) { - struct dentry * parent; + struct dentry *parent; if (dentry == root->dentry && vfsmnt == root->mnt) break; @@ -1931,8 +2211,10 @@ char *__d_path(const struct path *path, if (vfsmnt->mnt_parent == vfsmnt) { goto global_root; } + spin_unlock(&dentry->d_lock); dentry = vfsmnt->mnt_mountpoint; vfsmnt = vfsmnt->mnt_parent; + spin_lock(&dentry->d_lock); /* can't get unlinked because locked vfsmount */ continue; } parent = dentry->d_parent; @@ -1941,11 +2223,18 @@ char *__d_path(const struct path *path, (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; retval = end; + spin_unlock(&dentry->d_lock); dentry = parent; + spin_lock(&dentry->d_lock); + if (d_unlinked(dentry)) + goto unlinked; } out: - spin_unlock(&vfsmount_lock); + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return retval; global_root: @@ -1954,6 +2243,7 @@ global_root: goto Elong; root->mnt = vfsmnt; root->dentry = dentry; + /* XXX: this could wrongly modify root if we rename retry */ goto out; Elong: @@ -1997,10 +2287,12 @@ char *d_path(const struct path *path, ch root = current->fs->root; path_get(&root); read_unlock(¤t->fs->lock); - spin_lock(&dcache_lock); + + 
vfsmount_read_lock(); tmp = root; res = __d_path(path, &tmp, buf, buflen); - spin_unlock(&dcache_lock); + vfsmount_read_unlock(); + path_put(&root); return res; } @@ -2031,11 +2323,19 @@ char *dynamic_dname(struct dentry *dentr */ char *dentry_path(struct dentry *dentry, char *buf, int buflen) { - char *end = buf + buflen; + char *end; char *retval; + unsigned seq; - spin_lock(&dcache_lock); +rename_retry: + end = buf + buflen; prepend(&end, &buflen, "\0", 1); + + seq = read_seqbegin(&rename_lock); + vfsmount_read_lock(); + rcu_read_lock(); /* protect parent */ + spin_lock(&dentry->d_lock); +unlinked: if (d_unlinked(dentry) && (prepend(&end, &buflen, "//deleted", 9) != 0)) goto Elong; @@ -2054,13 +2354,22 @@ char *dentry_path(struct dentry *dentry, goto Elong; retval = end; + spin_unlock(&dentry->d_lock); dentry = parent; + spin_lock(&dentry->d_lock); + if (d_unlinked(dentry)) + goto unlinked; } - spin_unlock(&dcache_lock); +out: + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + vfsmount_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return retval; Elong: - spin_unlock(&dcache_lock); - return ERR_PTR(-ENAMETOOLONG); + retval = ERR_PTR(-ENAMETOOLONG); + goto out; } /* @@ -2098,14 +2407,17 @@ SYSCALL_DEFINE2(getcwd, char __user *, b read_unlock(¤t->fs->lock); error = -ENOENT; - spin_lock(&dcache_lock); + vfsmount_read_lock(); + spin_lock(&pwd.dentry->d_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; + spin_unlock(&pwd.dentry->d_lock); + /* XXX: race here, have to close (eg. 
return unlinked from __d_path) */ cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE); - spin_unlock(&dcache_lock); + vfsmount_read_unlock(); error = PTR_ERR(cwd); if (IS_ERR(cwd)) @@ -2118,8 +2430,10 @@ SYSCALL_DEFINE2(getcwd, char __user *, b if (copy_to_user(buf, cwd, len)) error = -EFAULT; } - } else - spin_unlock(&dcache_lock); + } else { + spin_unlock(&pwd.dentry->d_lock); + vfsmount_read_unlock(); + } out: path_put(&pwd); @@ -2147,35 +2461,39 @@ out: int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { int result; - unsigned long seq; + unsigned seq; if (new_dentry == old_dentry) return 1; - /* - * Need rcu_readlock to protect against the d_parent trashing - * due to d_move - */ - rcu_read_lock(); do { /* for restarting inner loop in case of seq retry */ seq = read_seqbegin(&rename_lock); + /* + * Need rcu_readlock to protect against the d_parent trashing + * due to d_move + */ + rcu_read_lock(); if (d_ancestor(old_dentry, new_dentry)) result = 1; else result = 0; + rcu_read_unlock(); } while (read_seqretry(&rename_lock, seq)); - rcu_read_unlock(); return result; } void d_genocide(struct dentry *root) { - struct dentry *this_parent = root; + struct dentry *this_parent; struct list_head *next; + unsigned seq; - spin_lock(&dcache_lock); +rename_retry: + this_parent = root; + seq = read_seqbegin(&rename_lock); + spin_lock(&this_parent->d_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -2183,21 +2501,49 @@ resume: struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; - if (d_unhashed(dentry)||!dentry->d_inode) + + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (d_unhashed(dentry) || !dentry->d_inode) { + spin_unlock(&dentry->d_lock); continue; + } if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); + spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); this_parent = dentry; + spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto 
repeat; } - atomic_dec(&dentry->d_count); + dentry->d_count--; + spin_unlock(&dentry->d_lock); } if (this_parent != root) { - next = this_parent->d_u.d_child.next; - atomic_dec(&this_parent->d_count); - this_parent = this_parent->d_parent; + struct dentry *tmp; + struct dentry *child; + + tmp = this_parent->d_parent; + this_parent->d_count--; + rcu_read_lock(); + spin_unlock(&this_parent->d_lock); + child = this_parent; + this_parent = tmp; + spin_lock(&this_parent->d_lock); + /* might go back up the wrong parent if we have had a rename + * or deletion */ + if (this_parent != child->d_parent || + // d_unlinked(this_parent) || XXX + read_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + goto rename_retry; + } + rcu_read_unlock(); + next = child->d_u.d_child.next; goto resume; } - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; } /** @@ -2250,7 +2596,7 @@ static void __init dcache_init_early(voi dentry_hashtable = alloc_large_system_hash("Dentry cache", - sizeof(struct hlist_head), + sizeof(struct dcache_hash_bucket), dhash_entries, 13, HASH_EARLY, @@ -2258,8 +2604,10 @@ static void __init dcache_init_early(voi &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) - INIT_HLIST_HEAD(&dentry_hashtable[loop]); + for (loop = 0; loop < (1 << d_hash_shift); loop++) { + spin_lock_init(&dentry_hashtable[loop].lock); + INIT_HLIST_HEAD(&dentry_hashtable[loop].head); + } } static void __init dcache_init(void) @@ -2282,7 +2630,7 @@ static void __init dcache_init(void) dentry_hashtable = alloc_large_system_hash("Dentry cache", - sizeof(struct hlist_head), + sizeof(struct dcache_hash_bucket), dhash_entries, 13, 0, @@ -2290,8 +2638,10 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) - INIT_HLIST_HEAD(&dentry_hashtable[loop]); + for (loop = 0; loop < (1 << d_hash_shift); loop++) { + 
spin_lock_init(&dentry_hashtable[loop].lock); + INIT_HLIST_HEAD(&dentry_hashtable[loop].head); + } } /* SLAB cache for __getname() consumers */ @@ -2341,7 +2691,6 @@ EXPORT_SYMBOL(d_rehash); EXPORT_SYMBOL(d_splice_alias); EXPORT_SYMBOL(d_add_ci); EXPORT_SYMBOL(d_validate); -EXPORT_SYMBOL(dget_locked); EXPORT_SYMBOL(dput); EXPORT_SYMBOL(find_inode_number); EXPORT_SYMBOL(have_submounts); Index: linux-2.6/fs/namei.c =================================================================== --- linux-2.6.orig/fs/namei.c +++ linux-2.6/fs/namei.c @@ -685,15 +685,16 @@ int follow_up(struct path *path) { struct vfsmount *parent; struct dentry *mountpoint; - spin_lock(&vfsmount_lock); + + vfsmount_read_lock(); parent = path->mnt->mnt_parent; if (parent == path->mnt) { - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); return 0; } mntget(parent); mountpoint = dget(path->mnt->mnt_mountpoint); - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); @@ -764,23 +765,20 @@ static __always_inline void follow_dotdo nd->path.mnt == nd->root.mnt) { break; } - spin_lock(&dcache_lock); if (nd->path.dentry != nd->path.mnt->mnt_root) { nd->path.dentry = dget(nd->path.dentry->d_parent); - spin_unlock(&dcache_lock); dput(old); break; } - spin_unlock(&dcache_lock); - spin_lock(&vfsmount_lock); + vfsmount_read_lock(); parent = nd->path.mnt->mnt_parent; if (parent == nd->path.mnt) { - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); break; } mntget(parent); nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint); - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); dput(old); mntput(nd->path.mnt); nd->path.mnt = parent; @@ -2168,12 +2166,10 @@ void dentry_unhash(struct dentry *dentry { dget(dentry); shrink_dcache_parent(dentry); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) == 2) + if (dentry->d_count == 2) __d_drop(dentry); spin_unlock(&dentry->d_lock); - 
spin_unlock(&dcache_lock); } int vfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -2326,8 +2322,11 @@ static long do_unlinkat(int dfd, const c if (nd.last.name[nd.last.len]) goto slashes; inode = dentry->d_inode; - if (inode) - atomic_inc(&inode->i_count); + if (inode) { + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); + } error = mnt_want_write(nd.path.mnt); if (error) goto exit2; Index: linux-2.6/fs/namespace.c =================================================================== --- linux-2.6.orig/fs/namespace.c +++ linux-2.6/fs/namespace.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -37,12 +39,16 @@ #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) #define HASH_SIZE (1UL << HASH_SHIFT) -/* spinlock for vfsmount related operations, inplace of dcache_lock */ -__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); +/* + * vfsmount "brlock" style spinlock for vfsmount related operations, use + * vfsmount_read_lock/vfsmount_write_lock functions. 
+ */ +static DEFINE_PER_CPU(spinlock_t, vfsmount_lock); static int event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); +static DEFINE_SPINLOCK(mnt_id_lock); static int mnt_id_start = 0; static int mnt_group_start = 1; @@ -54,6 +60,49 @@ static struct rw_semaphore namespace_sem struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); +void vfsmount_read_lock(void) +{ + spinlock_t *lock; + + lock = &get_cpu_var(vfsmount_lock); + spin_lock(lock); +} + +void vfsmount_read_unlock(void) +{ + spinlock_t *lock; + + lock = &__get_cpu_var(vfsmount_lock); + spin_unlock(lock); + put_cpu_var(vfsmount_lock); +} + +void vfsmount_write_lock(void) +{ + int i; + int nr = 0; + + for_each_possible_cpu(i) { + spinlock_t *lock; + + lock = &per_cpu(vfsmount_lock, i); + spin_lock_nested(lock, nr); + nr++; + } +} + +void vfsmount_write_unlock(void) +{ + int i; + + for_each_possible_cpu(i) { + spinlock_t *lock; + + lock = &per_cpu(vfsmount_lock, i); + spin_unlock(lock); + } +} + static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); @@ -64,18 +113,21 @@ static inline unsigned long hash(struct #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) -/* allocation is serialized by namespace_sem */ +/* + * allocation is serialized by namespace_sem, but we need the spinlock to + * serialise with freeing. 
+ */ static int mnt_alloc_id(struct vfsmount *mnt) { int res; retry: ida_pre_get(&mnt_id_ida, GFP_KERNEL); - spin_lock(&vfsmount_lock); + spin_lock(&mnt_id_lock); res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); if (!res) mnt_id_start = mnt->mnt_id + 1; - spin_unlock(&vfsmount_lock); + spin_unlock(&mnt_id_lock); if (res == -EAGAIN) goto retry; @@ -85,11 +137,11 @@ retry: static void mnt_free_id(struct vfsmount *mnt) { int id = mnt->mnt_id; - spin_lock(&vfsmount_lock); + spin_lock(&mnt_id_lock); ida_remove(&mnt_id_ida, id); if (mnt_id_start > id) mnt_id_start = id; - spin_unlock(&vfsmount_lock); + spin_unlock(&mnt_id_lock); } /* @@ -125,6 +177,49 @@ void mnt_release_group_id(struct vfsmoun mnt->mnt_group_id = 0; } +static inline void add_mnt_count(struct vfsmount *mnt, int n) +{ +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_count, smp_processor_id())) += n; +#else + mnt->mnt_count += n; +#endif +} + +static inline void inc_mnt_count(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_count, smp_processor_id()))++; +#else + mnt->mnt_count++; +#endif +} + +static inline void dec_mnt_count(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_count, smp_processor_id()))--; +#else + mnt->mnt_count--; +#endif +} + +unsigned int count_mnt_count(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += *per_cpu_ptr(mnt->mnt_count, cpu); + } + + return count; +#else + return mnt->mnt_count; +#endif +} + struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -141,7 +236,13 @@ struct vfsmount *alloc_vfsmnt(const char goto out_free_id; } - atomic_set(&mnt->mnt_count, 1); +#ifdef CONFIG_SMP + mnt->mnt_count = alloc_percpu(int); + if (!mnt->mnt_count) + goto out_free_devname; +#else + mnt->mnt_count = 0; +#endif INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); 
INIT_LIST_HEAD(&mnt->mnt_mounts); @@ -153,14 +254,19 @@ struct vfsmount *alloc_vfsmnt(const char #ifdef CONFIG_SMP mnt->mnt_writers = alloc_percpu(int); if (!mnt->mnt_writers) - goto out_free_devname; + goto out_free_mntcount; #else mnt->mnt_writers = 0; #endif + preempt_disable(); + inc_mnt_count(mnt); + preempt_enable(); } return mnt; #ifdef CONFIG_SMP +out_free_mntcount: + free_percpu(mnt->mnt_count); out_free_devname: kfree(mnt->mnt_devname); #endif @@ -344,7 +450,7 @@ static int mnt_make_readonly(struct vfsm { int ret = 0; - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); mnt->mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -378,15 +484,15 @@ static int mnt_make_readonly(struct vfsm */ smp_wmb(); mnt->mnt_flags &= ~MNT_WRITE_HOLD; - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); return ret; } static void __mnt_unmake_readonly(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); mnt->mnt_flags &= ~MNT_READONLY; - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); } void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) @@ -439,10 +545,11 @@ struct vfsmount *__lookup_mnt(struct vfs struct vfsmount *lookup_mnt(struct path *path) { struct vfsmount *child_mnt; - spin_lock(&vfsmount_lock); + + vfsmount_read_lock(); if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) mntget(child_mnt); - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); return child_mnt; } @@ -473,9 +580,11 @@ static void detach_mnt(struct vfsmount * old_path->mnt = mnt->mnt_parent; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt_root; - list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_hash); + list_del_init(&mnt->mnt_child); old_path->dentry->d_mounted--; + WARN_ON(mnt->mnt_mounted != 1); + mnt->mnt_mounted--; } void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, @@ -492,6 +601,8 @@ static void attach_mnt(struct vfsmount * 
list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(path->mnt, path->dentry)); list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); + WARN_ON(mnt->mnt_mounted != 0); + mnt->mnt_mounted++; } /* @@ -514,6 +625,8 @@ static void commit_tree(struct vfsmount list_add_tail(&mnt->mnt_hash, mount_hashtable + hash(parent, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + WARN_ON(mnt->mnt_mounted != 0); + mnt->mnt_mounted++; touch_mnt_namespace(n); } @@ -617,43 +730,79 @@ static inline void __mntput(struct vfsmo void mntput_no_expire(struct vfsmount *mnt) { -repeat: - if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { - if (likely(!mnt->mnt_pinned)) { - spin_unlock(&vfsmount_lock); - __mntput(mnt); - return; + if (likely(mnt->mnt_mounted)) { + vfsmount_read_lock(); + if (unlikely(!mnt->mnt_mounted)) { + vfsmount_read_unlock(); + goto repeat; } - atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); - mnt->mnt_pinned = 0; - spin_unlock(&vfsmount_lock); - acct_auto_close_mnt(mnt); - security_sb_umount_close(mnt); - goto repeat; + dec_mnt_count(mnt); + vfsmount_read_unlock(); + + return; } -} +repeat: + vfsmount_write_lock(); + BUG_ON(mnt->mnt_mounted); + dec_mnt_count(mnt); + if (count_mnt_count(mnt)) { + vfsmount_write_unlock(); + return; + } + if (likely(!mnt->mnt_pinned)) { + vfsmount_write_unlock(); + __mntput(mnt); + return; + } + add_mnt_count(mnt, mnt->mnt_pinned + 1); + mnt->mnt_pinned = 0; + vfsmount_write_unlock(); + acct_auto_close_mnt(mnt); + security_sb_umount_close(mnt); + goto repeat; +} EXPORT_SYMBOL(mntput_no_expire); +void mntput(struct vfsmount *mnt) +{ + if (mnt) { + /* avoid cacheline pingpong */ + if (unlikely(mnt->mnt_expiry_mark)) + mnt->mnt_expiry_mark = 0; + mntput_no_expire(mnt); + } +} +EXPORT_SYMBOL(mntput); + +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) { + preempt_disable(); + inc_mnt_count(mnt); + preempt_enable(); + } + return mnt; +} +EXPORT_SYMBOL(mntget); + void mnt_pin(struct vfsmount 
*mnt) { - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); mnt->mnt_pinned++; - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); } - EXPORT_SYMBOL(mnt_pin); void mnt_unpin(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); if (mnt->mnt_pinned) { - atomic_inc(&mnt->mnt_count); + inc_mnt_count(mnt); mnt->mnt_pinned--; } - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); } - EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) @@ -934,12 +1083,13 @@ int may_umount_tree(struct vfsmount *mnt int minimum_refs = 0; struct vfsmount *p; - spin_lock(&vfsmount_lock); + /* write lock needed for count_mnt_count */ + vfsmount_write_lock(); for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += atomic_read(&p->mnt_count); + actual_refs += count_mnt_count(p); minimum_refs += 2; } - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); if (actual_refs > minimum_refs) return 0; @@ -965,10 +1115,12 @@ EXPORT_SYMBOL(may_umount_tree); int may_umount(struct vfsmount *mnt) { int ret = 1; - spin_lock(&vfsmount_lock); + + vfsmount_write_lock(); if (propagate_mount_busy(mnt, 2)) ret = 0; - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); + return ret; } @@ -983,13 +1135,14 @@ void release_mounts(struct list_head *he if (mnt->mnt_parent != mnt) { struct dentry *dentry; struct vfsmount *m; - spin_lock(&vfsmount_lock); + + vfsmount_write_lock(); dentry = mnt->mnt_mountpoint; m = mnt->mnt_parent; mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; m->mnt_ghosts--; - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); dput(dentry); mntput(m); } @@ -1013,6 +1166,8 @@ void umount_tree(struct vfsmount *mnt, i __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; list_del_init(&p->mnt_child); + WARN_ON(p->mnt_mounted != 1); + p->mnt_mounted--; if (p->mnt_parent != p) { p->mnt_parent->mnt_ghosts++; p->mnt_mountpoint->d_mounted--; @@ -1044,8 +1199,16 @@ static int do_umount(struct vfsmount *mn flags & 
(MNT_FORCE | MNT_DETACH)) return -EINVAL; - if (atomic_read(&mnt->mnt_count) != 2) + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. + */ + vfsmount_write_lock(); + if (count_mnt_count(mnt) != 2) { + vfsmount_write_unlock(); + return -EBUSY; + } + vfsmount_write_unlock(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; @@ -1087,7 +1250,7 @@ static int do_umount(struct vfsmount *mn } down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); event++; if (!(flags & MNT_DETACH)) @@ -1099,7 +1262,7 @@ static int do_umount(struct vfsmount *mn umount_tree(mnt, 1, &umount_list); retval = 0; } - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); if (retval) security_sb_umount_busy(mnt); up_write(&namespace_sem); @@ -1206,19 +1369,19 @@ struct vfsmount *copy_tree(struct vfsmou q = clone_mnt(p, p->mnt_root, flag); if (!q) goto Enomem; - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, &path); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); } } return res; Enomem: if (res) { LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); umount_tree(res, 0, &umount_list); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); release_mounts(&umount_list); } return NULL; @@ -1237,9 +1400,9 @@ void drop_collected_mounts(struct vfsmou { LIST_HEAD(umount_list); down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); up_write(&namespace_sem); release_mounts(&umount_list); } @@ -1357,7 +1520,7 @@ static int attach_recursive_mnt(struct v set_mnt_shared(p); } - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); @@ -1371,7 +1534,8 @@ static int attach_recursive_mnt(struct v list_del_init(&child->mnt_hash); 
commit_tree(child); } - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); + return 0; out_cleanup_ids: @@ -1433,10 +1597,10 @@ static int do_change_type(struct path *p goto out_unlock; } - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); out_unlock: up_write(&namespace_sem); @@ -1480,9 +1644,10 @@ static int do_loopback(struct path *path err = graft_tree(mnt, path); if (err) { LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); + + vfsmount_write_lock(); umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); release_mounts(&umount_list); } @@ -1540,9 +1705,9 @@ static int do_remount(struct path *path, if (!err) { security_sb_post_remount(path->mnt, flags, data); - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); touch_mnt_namespace(path->mnt->mnt_ns); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); } return err; } @@ -1717,7 +1882,7 @@ void mark_mounts_for_expiry(struct list_ return; down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -1736,7 +1901,7 @@ void mark_mounts_for_expiry(struct list_ touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, 1, &umounts); } - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); up_write(&namespace_sem); release_mounts(&umounts); @@ -2011,9 +2176,9 @@ static struct mnt_namespace *dup_mnt_ns( kfree(new_ns); return ERR_PTR(-ENOMEM); } - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts @@ -2210,7 +2375,7 @@ SYSCALL_DEFINE2(pivot_root, const char _ goto out2; /* not attached */ /* make sure we can reach put_old from 
new_root */ tmp = old.mnt; - spin_lock(&vfsmount_lock); + vfsmount_write_lock(); if (tmp != new.mnt) { for (;;) { if (tmp->mnt_parent == tmp) @@ -2230,7 +2395,7 @@ SYSCALL_DEFINE2(pivot_root, const char _ /* mount new_root on / */ attach_mnt(new.mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); chroot_fs_refs(&root, &new); security_sb_post_pivotroot(&root, &new); error = 0; @@ -2246,7 +2411,7 @@ out1: out0: return error; out3: - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); goto out2; } @@ -2276,6 +2441,7 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { unsigned u; + int i; int err; init_rwsem(&namespace_sem); @@ -2293,6 +2459,9 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mount_hashtable[u]); + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(vfsmount_lock, i)); + err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", @@ -2308,16 +2477,22 @@ void put_mnt_ns(struct mnt_namespace *ns { struct vfsmount *root; LIST_HEAD(umount_list); + spinlock_t *lock; - if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock)) + lock = &get_cpu_var(vfsmount_lock); + if (!atomic_dec_and_lock(&ns->count, lock)) { + put_cpu_var(vfsmount_lock); return; + } root = ns->root; ns->root = NULL; - spin_unlock(&vfsmount_lock); + spin_unlock(lock); + put_cpu_var(vfsmount_lock); + down_write(&namespace_sem); - spin_lock(&vfsmount_lock); - umount_tree(root, 0, &umount_list); - spin_unlock(&vfsmount_lock); + vfsmount_write_lock(); + umount_tree(root, 0, &umount_list); + vfsmount_write_unlock(); up_write(&namespace_sem); release_mounts(&umount_list); kfree(ns); Index: linux-2.6/fs/pnode.c =================================================================== --- linux-2.6.orig/fs/pnode.c +++ linux-2.6/fs/pnode.c @@ -264,12 +264,12 @@ int propagate_mnt(struct vfsmount *dest_ prev_src_mnt = child; } out: - spin_lock(&vfsmount_lock); + 
vfsmount_write_lock(); while (!list_empty(&tmp_list)) { child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); umount_tree(child, 0, &umount_list); } - spin_unlock(&vfsmount_lock); + vfsmount_write_unlock(); release_mounts(&umount_list); return ret; } @@ -279,7 +279,7 @@ out: */ static inline int do_refcount_check(struct vfsmount *mnt, int count) { - int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts; + int mycount = count_mnt_count(mnt) - mnt->mnt_ghosts; return (mycount > count); } Index: linux-2.6/fs/proc/base.c =================================================================== --- linux-2.6.orig/fs/proc/base.c +++ linux-2.6/fs/proc/base.c @@ -652,12 +652,12 @@ static unsigned mounts_poll(struct file poll_wait(file, &ns->poll, wait); - spin_lock(&vfsmount_lock); + vfsmount_read_lock(); if (p->event != ns->event) { p->event = ns->event; res |= POLLERR | POLLPRI; } - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); return res; } Index: linux-2.6/include/linux/mount.h =================================================================== --- linux-2.6.orig/include/linux/mount.h +++ linux-2.6/include/linux/mount.h @@ -56,20 +56,20 @@ struct vfsmount { struct mnt_namespace *mnt_ns; /* containing namespace */ int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ - /* - * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount - * to let these frequently modified fields in a separate cache line - * (so that reads of mnt_flags wont ping-pong on SMP machines) - */ - atomic_t mnt_count; int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; int mnt_ghosts; + int mnt_mounted; #ifdef CONFIG_SMP int *mnt_writers; #else int mnt_writers; #endif +#ifdef CONFIG_SMP + int *mnt_count; +#else + int mnt_count; +#endif }; static inline int *get_mnt_writers_ptr(struct vfsmount *mnt) @@ -81,32 +81,28 @@ static inline int *get_mnt_writers_ptr(s #endif } -static inline struct vfsmount *mntget(struct vfsmount *mnt) 
-{ - if (mnt) - atomic_inc(&mnt->mnt_count); - return mnt; -} - struct file; /* forward dec */ +extern void vfsmount_read_lock(void); +extern void vfsmount_read_unlock(void); +extern void vfsmount_write_lock(void); +extern void vfsmount_write_unlock(void); + +extern unsigned int count_mnt_count(struct vfsmount *mnt); + extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); + extern void mntput_no_expire(struct vfsmount *mnt); +extern struct vfsmount *mntget(struct vfsmount *mnt); +extern void mntput(struct vfsmount *mnt); + extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); extern int __mnt_is_readonly(struct vfsmount *mnt); -static inline void mntput(struct vfsmount *mnt) -{ - if (mnt) { - mnt->mnt_expiry_mark = 0; - mntput_no_expire(mnt); - } -} - extern struct vfsmount *do_kern_mount(const char *fstype, int flags, const char *name, void *data); @@ -123,7 +119,6 @@ extern int do_add_mount(struct vfsmount extern void mark_mounts_for_expiry(struct list_head *mounts); -extern spinlock_t vfsmount_lock; extern dev_t name_to_dev_t(char *name); #endif /* _LINUX_MOUNT_H */ Index: linux-2.6/kernel/audit_tree.c =================================================================== --- linux-2.6.orig/kernel/audit_tree.c +++ linux-2.6/kernel/audit_tree.c @@ -758,15 +758,15 @@ int audit_tag_tree(char *old, char *new) continue; } - spin_lock(&vfsmount_lock); + vfsmount_read_lock(); if (!is_under(mnt, dentry, &path)) { - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); path_put(&path); put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } - spin_unlock(&vfsmount_lock); + vfsmount_read_unlock(); path_put(&path); list_for_each_entry(p, &list, mnt_list) { Index: linux-2.6/security/tomoyo/realpath.c =================================================================== --- 
linux-2.6.orig/security/tomoyo/realpath.c +++ linux-2.6/security/tomoyo/realpath.c @@ -96,16 +96,14 @@ int tomoyo_realpath_from_path2(struct pa root = current->fs->root; path_get(&root); read_unlock(¤t->fs->lock); - spin_lock(&vfsmount_lock); + vfsmount_read_lock(); if (root.mnt && root.mnt->mnt_ns) ns_root.mnt = mntget(root.mnt->mnt_ns->root); if (ns_root.mnt) ns_root.dentry = dget(ns_root.mnt->mnt_root); - spin_unlock(&vfsmount_lock); - spin_lock(&dcache_lock); tmp = ns_root; sp = __d_path(path, &tmp, newname, newname_len); - spin_unlock(&dcache_lock); + vfsmount_read_unlock(); path_put(&root); path_put(&ns_root); } Index: linux-2.6/fs/libfs.c =================================================================== --- linux-2.6.orig/fs/libfs.c +++ linux-2.6/fs/libfs.c @@ -14,6 +14,11 @@ #include +static inline int simple_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -79,7 +84,8 @@ int dcache_dir_close(struct inode *inode loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) { - mutex_lock(&file->f_path.dentry->d_inode->i_mutex); + struct dentry *dentry = file->f_path.dentry; + mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -87,7 +93,7 @@ loff_t dcache_dir_lseek(struct file *fil if (offset >= 0) break; default: - mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -97,21 +103,27 @@ loff_t dcache_dir_lseek(struct file *fil struct dentry *cursor = file->private_data; loff_t n = file->f_pos - 2; - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + spin_lock_nested(&cursor->d_lock, DENTRY_D_LOCK_NESTED); list_del(&cursor->d_u.d_child); - p = file->f_path.dentry->d_subdirs.next; - while (n && p != &file->f_path.dentry->d_subdirs) { + spin_unlock(&cursor->d_lock); + p = 
dentry->d_subdirs.next; + while (n && p != &dentry->d_subdirs) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); - if (!d_unhashed(next) && next->d_inode) + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(next)) n--; + spin_unlock(&next->d_lock); p = p->next; } + spin_lock_nested(&cursor->d_lock, DENTRY_D_LOCK_NESTED); list_add_tail(&cursor->d_u.d_child, p); - spin_unlock(&dcache_lock); + spin_unlock(&cursor->d_lock); + spin_unlock(&dentry->d_lock); } } - mutex_unlock(&file->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); return offset; } @@ -151,29 +163,38 @@ int dcache_readdir(struct file * filp, v i++; /* fallthrough */ default: - spin_lock(&dcache_lock); - if (filp->f_pos == 2) + spin_lock(&dentry->d_lock); + if (filp->f_pos == 2) { + spin_lock_nested(&cursor->d_lock, DENTRY_D_LOCK_NESTED); list_move(q, &dentry->d_subdirs); + spin_unlock(&cursor->d_lock); + } for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); - if (d_unhashed(next) || !next->d_inode) + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); + if (!simple_positive(next)) { + spin_unlock(&next->d_lock); continue; + } - spin_unlock(&dcache_lock); + spin_unlock(&next->d_lock); + spin_unlock(&dentry->d_lock); if (filldir(dirent, next->d_name.name, next->d_name.len, filp->f_pos, next->d_inode->i_ino, dt_type(next->d_inode)) < 0) return 0; - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); + spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); /* next is still alive */ list_move(q, p); + spin_unlock(&next->d_lock); p = q; filp->f_pos++; } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); } return 0; } @@ -244,6 +265,7 @@ int get_sb_pseudo(struct file_system_typ d_instantiate(dentry, root); s->s_root = dentry; s->s_flags |= MS_ACTIVE; + mnt->mnt_mounted++; simple_set_mnt(mnt, s); return 0; @@ -258,29 +280,31 @@ int 
simple_link(struct dentry *old_dentr inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); dget(dentry); d_instantiate(dentry, inode); return 0; } -static inline int simple_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; - spin_lock(&dcache_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) - if (simple_positive(child)) + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); goto out; + } + spin_unlock(&child->d_lock); + } ret = 1; out: - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); return ret; } Index: linux-2.6/include/linux/dcache.h =================================================================== --- linux-2.6.orig/include/linux/dcache.h +++ linux-2.6/include/linux/dcache.h @@ -37,8 +37,8 @@ struct qstr { }; struct dentry_stat_t { - int nr_dentry; - int nr_unused; + int nr_dentry; /* unused */ + int nr_unused; /* protected by dcache_lru_lock */ int age_limit; /* age in seconds */ int want_pages; /* pages requested by system */ int dummy[2]; @@ -87,7 +87,7 @@ full_name_hash(const unsigned char *name #endif struct dentry { - atomic_t d_count; + unsigned int d_count; /* protected by d_lock */ unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ int d_mounted; @@ -150,13 +150,13 @@ struct dentry_operations { /* locking rules: - big lock dcache_lock d_lock may block -d_revalidate: no no no yes -d_hash no no no yes -d_compare: no yes yes no -d_delete: no yes no no -d_release: no no no yes -d_iput: no no no yes + big lock d_lock may block +d_revalidate: no no yes +d_hash no no yes +d_compare: 
no yes no +d_delete: no no no +d_release: no no yes +d_iput: no no yes */ /* d_flags entries */ @@ -186,7 +186,6 @@ d_iput: no no no yes #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ -extern spinlock_t dcache_lock; extern seqlock_t rename_lock; /** @@ -204,23 +203,8 @@ extern seqlock_t rename_lock; * * __d_drop requires dentry->d_lock. */ - -static inline void __d_drop(struct dentry *dentry) -{ - if (!(dentry->d_flags & DCACHE_UNHASHED)) { - dentry->d_flags |= DCACHE_UNHASHED; - hlist_del_rcu(&dentry->d_hash); - } -} - -static inline void d_drop(struct dentry *dentry) -{ - spin_lock(&dcache_lock); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); -} +void d_drop(struct dentry *dentry); +void __d_drop(struct dentry *dentry); static inline int dname_external(struct dentry *dentry) { @@ -318,28 +302,31 @@ extern char *dentry_path(struct dentry * /* Allocation counts.. */ /** - * dget, dget_locked - get a reference to a dentry + * dget, dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be - * destroyed when it has references. dget() should never be - * called for dentries with zero reference counter. For these cases - * (preferably none, functions in dcache.c are sufficient for normal - * needs and they take necessary precautions) you should hold dcache_lock - * and call dget_locked() instead of dget(). + * destroyed when it has references. 
*/ - +static inline struct dentry *dget_dlock(struct dentry *dentry) +{ + if (dentry) + dentry->d_count++; + return dentry; +} + static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { - BUG_ON(!atomic_read(&dentry->d_count)); - atomic_inc(&dentry->d_count); + spin_lock(&dentry->d_lock); + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); } return dentry; } -extern struct dentry * dget_locked(struct dentry *); +extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed @@ -358,16 +345,6 @@ static inline int d_unlinked(struct dent return d_unhashed(dentry) && !IS_ROOT(dentry); } -static inline struct dentry *dget_parent(struct dentry *dentry) -{ - struct dentry *ret; - - spin_lock(&dentry->d_lock); - ret = dget(dentry->d_parent); - spin_unlock(&dentry->d_lock); - return ret; -} - extern void dput(struct dentry *); static inline int d_mountpoint(struct dentry *dentry) Index: linux-2.6/kernel/sysctl.c =================================================================== --- linux-2.6.orig/kernel/sysctl.c +++ linux-2.6/kernel/sysctl.c @@ -1443,7 +1443,7 @@ static struct ctl_table fs_table[] = { .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_nr_inodes, }, { .ctl_name = FS_STATINODE, @@ -1451,7 +1451,7 @@ static struct ctl_table fs_table[] = { .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_nr_inodes, }, { .procname = "file-nr", @@ -1479,6 +1479,12 @@ static struct ctl_table fs_table[] = { .extra2 = &sysctl_nr_open_max, }, { + /* + * dentry_stat has an atomic_t member, so this is a bit of + * a hack, but it works for the moment, and I won't bother + * changing it now because we'll probably want to change to + * a more scalable counter anyway. 
+ */ .ctl_name = FS_DENTRY, .procname = "dentry-state", .data = &dentry_stat, Index: linux-2.6/fs/configfs/dir.c =================================================================== --- linux-2.6.orig/fs/configfs/dir.c +++ linux-2.6/fs/configfs/dir.c @@ -399,8 +399,7 @@ static void remove_dir(struct dentry * d if (d->d_inode) simple_rmdir(parent->d_inode,d); - pr_debug(" o %s removing done (%d)\n",d->d_name.name, - atomic_read(&d->d_count)); + pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); dput(parent); } Index: linux-2.6/fs/locks.c =================================================================== --- linux-2.6.orig/fs/locks.c +++ linux-2.6/fs/locks.c @@ -1374,8 +1374,7 @@ int generic_setlease(struct file *filp, if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; if ((arg == F_WRLCK) - && ((atomic_read(&dentry->d_count) > 1) - || (atomic_read(&inode->i_count) > 1))) + && (dentry->d_count > 1 || inode->i_count > 1)) goto out; } Index: linux-2.6/fs/autofs4/expire.c =================================================================== --- linux-2.6.orig/fs/autofs4/expire.c +++ linux-2.6/fs/autofs4/expire.c @@ -93,22 +93,59 @@ done: /* * Calculate next entry in top down tree traversal. * From next_mnt in namespace.c - elegant. + * + * How is this supposed to work if we drop autofs4_lock between calls anyway? + * How does it cope with renames? + * And also callers dput the returned dentry before taking autofs4_lock again + * so what prevents it from being freed?? 
*/ -static struct dentry *next_dentry(struct dentry *p, struct dentry *root) +static struct dentry *get_next_positive_dentry(struct dentry *p, + struct dentry *root) { - struct list_head *next = p->d_subdirs.next; + struct list_head *next; + struct dentry *ret; + spin_lock(&autofs4_lock); +again: + spin_lock(&p->d_lock); + next = p->d_subdirs.next; if (next == &p->d_subdirs) { while (1) { - if (p == root) + struct dentry *parent; + + if (p == root) { + spin_unlock(&p->d_lock); + spin_unlock(&autofs4_lock); return NULL; + } + + parent = p->d_parent; + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&p->d_lock); + goto again; + } + spin_unlock(&p->d_lock); next = p->d_u.d_child.next; - if (next != &p->d_parent->d_subdirs) + p = parent; + if (next != &parent->d_subdirs) break; - p = p->d_parent; } } - return list_entry(next, struct dentry, d_u.d_child); + ret = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(ret)) { + spin_unlock(&ret->d_lock); + p = ret; + goto again; + } + dget_dlock(ret); + spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&autofs4_lock); + + return ret; } /* @@ -158,18 +195,11 @@ static int autofs4_tree_busy(struct vfsm if (!simple_positive(top)) return 1; - spin_lock(&dcache_lock); - for (p = top; p; p = next_dentry(p, top)) { - /* Negative dentry - give up */ - if (!simple_positive(p)) - continue; + for (p = dget(top); p; p = get_next_positive_dentry(p, top)) { DPRINTK("dentry %p %.*s", p, (int) p->d_name.len, p->d_name.name); - p = dget(p); - spin_unlock(&dcache_lock); - /* * Is someone visiting anywhere in the subtree ? 
* If there's no mount we need to check the usage @@ -198,16 +228,14 @@ static int autofs4_tree_busy(struct vfsm else ino_count++; - if (atomic_read(&p->d_count) > ino_count) { + if (p->d_count > ino_count) { top_ino->last_used = jiffies; dput(p); return 1; } } dput(p); - spin_lock(&dcache_lock); } - spin_unlock(&dcache_lock); /* Timeout of a tree mount is ultimately determined by its top dentry */ if (!autofs4_can_expire(top, timeout, do_now)) @@ -226,18 +254,11 @@ static struct dentry *autofs4_check_leav DPRINTK("parent %p %.*s", parent, (int)parent->d_name.len, parent->d_name.name); - spin_lock(&dcache_lock); - for (p = parent; p; p = next_dentry(p, parent)) { - /* Negative dentry - give up */ - if (!simple_positive(p)) - continue; + for (p = dget(parent); p; p = get_next_positive_dentry(p, parent)) { DPRINTK("dentry %p %.*s", p, (int) p->d_name.len, p->d_name.name); - p = dget(p); - spin_unlock(&dcache_lock); - if (d_mountpoint(p)) { /* Can we umount this guy */ if (autofs4_mount_busy(mnt, p)) @@ -249,9 +270,7 @@ static struct dentry *autofs4_check_leav } cont: dput(p); - spin_lock(&dcache_lock); } - spin_unlock(&dcache_lock); return NULL; } @@ -294,6 +313,8 @@ struct dentry *autofs4_expire_direct(str * A tree is eligible if :- * - it is unused by any user process * - it has been unused for exp_timeout time + * This seems to be racy dropping autofs4_lock and asking for next->next after + * the lock has been dropped. 
*/ struct dentry *autofs4_expire_indirect(struct super_block *sb, struct vfsmount *mnt, @@ -315,7 +336,8 @@ struct dentry *autofs4_expire_indirect(s now = jiffies; timeout = sbi->exp_timeout; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&root->d_lock); next = root->d_subdirs.next; /* On exit from the loop expire is set to a dgot dentry @@ -329,8 +351,11 @@ struct dentry *autofs4_expire_indirect(s continue; } - dentry = dget(dentry); - spin_unlock(&dcache_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry = dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&root->d_lock); + spin_unlock(&autofs4_lock); spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); @@ -347,7 +372,7 @@ struct dentry *autofs4_expire_indirect(s /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 2; - if (atomic_read(&dentry->d_count) > ino_count) + if (dentry->d_count > ino_count) goto next; /* Can we umount this guy */ @@ -369,7 +394,7 @@ struct dentry *autofs4_expire_indirect(s if (!exp_leaves) { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; - if (atomic_read(&dentry->d_count) > ino_count) + if (dentry->d_count > ino_count) goto next; if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { @@ -383,7 +408,7 @@ struct dentry *autofs4_expire_indirect(s } else { /* Path walk currently on this dentry? 
*/ ino_count = atomic_read(&ino->count) + 1; - if (atomic_read(&dentry->d_count) > ino_count) + if (dentry->d_count > ino_count) goto next; expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); @@ -395,10 +420,12 @@ struct dentry *autofs4_expire_indirect(s next: spin_unlock(&sbi->fs_lock); dput(dentry); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&root->d_lock); next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&root->d_lock); + spin_unlock(&autofs4_lock); return NULL; found: @@ -408,9 +435,13 @@ found: ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&dcache_lock); - list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); - spin_unlock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&expired->d_parent->d_lock); + spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + spin_unlock(&expired->d_lock); + spin_unlock(&expired->d_parent->d_lock); + spin_unlock(&autofs4_lock); return expired; } Index: linux-2.6/fs/autofs4/root.c =================================================================== --- linux-2.6.orig/fs/autofs4/root.c +++ linux-2.6/fs/autofs4/root.c @@ -17,8 +17,11 @@ #include #include #include +#include #include "autofs_i.h" +DEFINE_SPINLOCK(autofs4_lock); + static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); @@ -92,12 +95,15 @@ static int autofs4_dir_open(struct inode * autofs file system so just let the libfs routines handle * it. 
*/ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&dentry->d_lock); if (!d_mountpoint(dentry) && __simple_empty(dentry)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&autofs4_lock); return -ENOENT; } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&autofs4_lock); out: return dcache_dir_open(inode, file); @@ -211,10 +217,12 @@ static void *autofs4_follow_link(struct * multi-mount with no root mount offset. So don't try to * mount it again. */ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&dentry->d_lock); if (dentry->d_flags & DCACHE_AUTOFS_PENDING || (!d_mountpoint(dentry) && __simple_empty(dentry))) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&autofs4_lock); status = try_to_fill_dentry(dentry, 0); if (status) @@ -222,7 +230,8 @@ static void *autofs4_follow_link(struct goto follow; } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&autofs4_lock); follow: /* * If there is no root mount it must be an autofs @@ -292,13 +301,13 @@ static int autofs4_revalidate(struct den return 0; /* Check for a non-mountpoint directory with no contents */ - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); if (S_ISDIR(dentry->d_inode->i_mode) && !d_mountpoint(dentry) && __simple_empty(dentry)) { DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); /* The daemon never causes a mount to trigger */ if (oz_mode) @@ -314,7 +323,7 @@ static int autofs4_revalidate(struct den return status; } - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return 1; } @@ -366,7 +375,7 @@ static struct dentry *autofs4_lookup_act const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; list_for_each(p, head) { @@ -380,7 +389,7 @@ 
static struct dentry *autofs4_lookup_act spin_lock(&dentry->d_lock); /* Already gone? */ - if (atomic_read(&dentry->d_count) == 0) + if (dentry->d_count == 0) goto next; qstr = &dentry->d_name; @@ -396,17 +405,17 @@ static struct dentry *autofs4_lookup_act goto next; if (d_unhashed(dentry)) { - dget(dentry); + dget_dlock(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return dentry; } next: spin_unlock(&dentry->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return NULL; } @@ -418,7 +427,7 @@ static struct dentry *autofs4_lookup_exp const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; list_for_each(p, head) { @@ -448,17 +457,17 @@ static struct dentry *autofs4_lookup_exp goto next; if (d_unhashed(dentry)) { - dget(dentry); + dget_dlock(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return dentry; } next: spin_unlock(&dentry->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return NULL; } @@ -704,7 +713,7 @@ static int autofs4_dir_unlink(struct ino dir->i_mtime = CURRENT_TIME; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); if (list_empty(&ino->expiring)) list_add(&ino->expiring, &sbi->expiring_list); @@ -712,7 +721,7 @@ static int autofs4_dir_unlink(struct ino spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); return 0; } @@ -729,19 +738,21 @@ static int autofs4_dir_rmdir(struct inod if (!autofs4_oz_mode(sbi)) return -EACCES; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); + spin_lock(&dentry->d_lock); if 
(!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&sbi->lookup_lock); + spin_unlock(&autofs4_lock); return -ENOTEMPTY; } - spin_lock(&sbi->lookup_lock); if (list_empty(&ino->expiring)) list_add(&ino->expiring, &sbi->expiring_list); spin_unlock(&sbi->lookup_lock); - spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); Index: linux-2.6/fs/coda/dir.c =================================================================== --- linux-2.6.orig/fs/coda/dir.c +++ linux-2.6/fs/coda/dir.c @@ -302,7 +302,9 @@ static int coda_link(struct dentry *sour } coda_dir_update_mtime(dir_inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); d_instantiate(de, inode); inc_nlink(inode); @@ -611,7 +613,7 @@ static int coda_dentry_revalidate(struct if (cii->c_flags & C_FLUSH) coda_flag_inode_children(inode, C_FLUSH); - if (atomic_read(&de->d_count) > 1) + if (de->d_count > 1) /* pretend it's valid, but don't change the flags */ goto out; Index: linux-2.6/fs/ecryptfs/inode.c =================================================================== --- linux-2.6.orig/fs/ecryptfs/inode.c +++ linux-2.6/fs/ecryptfs/inode.c @@ -263,7 +263,7 @@ int ecryptfs_lookup_and_interpose_lower( ecryptfs_dentry->d_parent)); lower_inode = lower_dentry->d_inode; fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); - BUG_ON(!atomic_read(&lower_dentry->d_count)); + BUG_ON(!lower_dentry->d_count); ecryptfs_set_dentry_private(ecryptfs_dentry, kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL)); Index: linux-2.6/fs/hpfs/namei.c =================================================================== --- linux-2.6.orig/fs/hpfs/namei.c +++ linux-2.6/fs/hpfs/namei.c @@ -415,7 +415,7 @@ again: 
mutex_unlock(&hpfs_i(inode)->i_parent_mutex); d_drop(dentry); spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1 || + if (dentry->d_count > 1 || generic_permission(inode, MAY_WRITE, NULL) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { Index: linux-2.6/fs/nfs/dir.c =================================================================== --- linux-2.6.orig/fs/nfs/dir.c +++ linux-2.6/fs/nfs/dir.c @@ -1325,7 +1325,7 @@ static int nfs_sillyrename(struct inode dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - atomic_read(&dentry->d_count)); + dentry->d_count); nfs_inc_stats(dir, NFSIOS_SILLYRENAME); /* @@ -1432,11 +1432,9 @@ static int nfs_unlink(struct inode *dir, dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); - if (atomic_read(&dentry->d_count) > 1) { + if (dentry->d_count > 1) { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); /* Start asynchronous writeout of the inode */ write_inode_now(dentry->d_inode, 0); error = nfs_sillyrename(dir, dentry); @@ -1447,7 +1445,6 @@ static int nfs_unlink(struct inode *dir, need_rehash = 1; } spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); error = nfs_safe_remove(dentry); if (!error || error == -ENOENT) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -1539,7 +1536,9 @@ nfs_link(struct dentry *old_dentry, stru d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); d_add(dentry, inode); } return error; @@ -1589,7 +1588,7 @@ static int nfs_rename(struct inode *old_ dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, - atomic_read(&new_dentry->d_count)); + 
new_dentry->d_count); /* * First check whether the target is busy ... we can't @@ -1605,7 +1604,7 @@ static int nfs_rename(struct inode *old_ error = -EISDIR; if (!S_ISDIR(old_inode->i_mode)) goto out; - } else if (atomic_read(&new_dentry->d_count) > 2) { + } else if (new_dentry->d_count > 2) { int err; /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, @@ -1620,7 +1619,7 @@ static int nfs_rename(struct inode *old_ new_inode = NULL; /* instantiate the replacement target */ d_instantiate(new_dentry, NULL); - } else if (atomic_read(&new_dentry->d_count) > 1) + } else if (new_dentry->d_count > 1) /* dentry still busy? */ goto out; } @@ -1629,7 +1628,7 @@ go_ahead: /* * ... prune child dentries and writebacks if needed. */ - if (atomic_read(&old_dentry->d_count) > 1) { + if (old_dentry->d_count > 1) { if (S_ISREG(old_inode->i_mode)) nfs_wb_all(old_inode); shrink_dcache_parent(old_dentry); Index: linux-2.6/fs/nfsd/vfs.c =================================================================== --- linux-2.6.orig/fs/nfsd/vfs.c +++ linux-2.6/fs/nfsd/vfs.c @@ -1754,8 +1754,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru goto out_dput_new; if (svc_msnfs(ffhp) && - ((atomic_read(&odentry->d_count) > 1) - || (atomic_read(&ndentry->d_count) > 1))) { + ((odentry->d_count > 1) || (ndentry->d_count > 1))) { host_err = -EPERM; goto out_dput_new; } @@ -1841,7 +1840,7 @@ nfsd_unlink(struct svc_rqst *rqstp, stru if (type != S_IFDIR) { /* It's UNLINK */ #ifdef MSNFS if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (atomic_read(&rdentry->d_count) > 1)) { + (rdentry->d_count > 1)) { host_err = -EPERM; } else #endif Index: linux-2.6/fs/exportfs/expfs.c =================================================================== --- linux-2.6.orig/fs/exportfs/expfs.c +++ linux-2.6/fs/exportfs/expfs.c @@ -43,24 +43,26 @@ find_acceptable_alias(struct dentry *res void *context) { struct dentry *dentry, *toput = NULL; + struct inode *inode; if (acceptable(context, result)) return 
result; - spin_lock(&dcache_lock); - list_for_each_entry(dentry, &result->d_inode->i_dentry, d_alias) { - dget_locked(dentry); - spin_unlock(&dcache_lock); + inode = result->d_inode; + spin_lock(&inode->i_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + dget(dentry); + spin_unlock(&inode->i_lock); if (toput) dput(toput); if (dentry != result && acceptable(context, dentry)) { dput(result); return dentry; } - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); toput = dentry; } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); if (toput) dput(toput); @@ -74,12 +76,19 @@ static struct dentry * find_disconnected_root(struct dentry *dentry) { dget(dentry); +again: spin_lock(&dentry->d_lock); while (!IS_ROOT(dentry) && (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) { struct dentry *parent = dentry->d_parent; - dget(parent); + + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&dentry->d_lock); + goto again; + } + dget_dlock(parent); spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); dput(dentry); dentry = parent; spin_lock(&dentry->d_lock); Index: linux-2.6/fs/notify/inotify/inotify.c =================================================================== --- linux-2.6.orig/fs/notify/inotify/inotify.c +++ linux-2.6/fs/notify/inotify/inotify.c @@ -185,23 +185,25 @@ static void set_dentry_child_flags(struc { struct dentry *alias; - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); list_for_each_entry(alias, &inode->i_dentry, d_alias) { struct dentry *child; + spin_lock(&alias->d_lock); list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { if (!child->d_inode) continue; - spin_lock(&child->d_lock); + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (watched) child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; else child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } + spin_unlock(&alias->d_lock); } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } /* @@ -269,6 +271,7 @@ 
void inotify_d_instantiate(struct dentry if (!inode) return; + /* XXX: need parent lock in place of dcache_lock? */ spin_lock(&entry->d_lock); parent = entry->d_parent; if (parent->d_inode && inotify_inode_watched(parent->d_inode)) @@ -283,6 +286,7 @@ void inotify_d_move(struct dentry *entry { struct dentry *parent; + /* XXX: need parent lock in place of dcache_lock? */ parent = entry->d_parent; if (inotify_inode_watched(parent->d_inode)) entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; @@ -339,18 +343,28 @@ void inotify_dentry_parent_queue_event(s if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED)) return; +again: spin_lock(&dentry->d_lock); parent = dentry->d_parent; + if (parent != dentry && !spin_trylock(&parent->d_lock)) { + spin_unlock(&dentry->d_lock); + goto again; + } inode = parent->d_inode; if (inotify_inode_watched(inode)) { - dget(parent); + dget_dlock(parent); spin_unlock(&dentry->d_lock); + if (parent != dentry) + spin_unlock(&parent->d_lock); inotify_inode_queue_event(inode, mask, cookie, name, dentry->d_inode); dput(parent); - } else + } else { spin_unlock(&dentry->d_lock); + if (parent != dentry) + spin_unlock(&parent->d_lock); + } } EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event); @@ -371,76 +385,86 @@ EXPORT_SYMBOL_GPL(inotify_get_cookie); * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. * We temporarily drop inode_lock, however, and CAN block. */ -void inotify_unmount_inodes(struct list_head *list) +void inotify_unmount_inodes(struct super_block *sb) { - struct inode *inode, *next_i, *need_iput = NULL; + int i; - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { - struct inotify_watch *watch, *next_w; - struct inode *need_iput_tmp; - struct list_head *watches; - - /* - * We cannot __iget() an inode in state I_CLEAR, I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. 
- */ - if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) - continue; - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) - continue; - - need_iput_tmp = need_iput; - need_iput = NULL; - /* In case inotify_remove_watch_locked() drops a reference. */ - if (inode != need_iput_tmp) - __iget(inode); - else - need_iput_tmp = NULL; - /* In case the dropping of a reference would nuke next_i. */ - if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_CLEAR | I_FREEING | - I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; - } + for_each_possible_cpu(i) { + struct inode *inode, *next_i, *need_iput = NULL; + struct list_head *list; +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_inodes, i); +#else + list = &sb->s_inodes; +#endif + + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inotify_watch *watch, *next_w; + struct inode *need_iput_tmp; + struct list_head *watches; + + spin_lock(&inode->i_lock); + /* + * We cannot __iget() an inode in state I_CLEAR, I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } - /* - * We can safely drop inode_lock here because we hold - * references on both inode and next_i. Also no new inodes - * will be added since the umount has begun. Finally, - * iprune_mutex keeps shrink_icache_memory() away. 
- */ - spin_unlock(&inode_lock); - - if (need_iput_tmp) - iput(need_iput_tmp); - - /* for each watch, send IN_UNMOUNT and then remove it */ - mutex_lock(&inode->inotify_mutex); - watches = &inode->inotify_watches; - list_for_each_entry_safe(watch, next_w, watches, i_list) { - struct inotify_handle *ih= watch->ih; - get_inotify_watch(watch); - mutex_lock(&ih->mutex); - ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0, - NULL, NULL); - inotify_remove_watch_locked(ih, watch); - mutex_unlock(&ih->mutex); - put_inotify_watch(watch); - } - mutex_unlock(&inode->inotify_mutex); - iput(inode); + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!inode->i_count) { + spin_unlock(&inode->i_lock); + continue; + } - spin_lock(&inode_lock); + need_iput_tmp = need_iput; + need_iput = NULL; + /* In case inotify_remove_watch_locked() drops a reference. */ + if (inode != need_iput_tmp) { + __iget(inode); + } else + need_iput_tmp = NULL; + + spin_unlock(&inode->i_lock); + + /* In case the dropping of a reference would nuke next_i. 
*/ + if (&next_i->i_sb_list != list) { + spin_lock(&next_i->i_lock); + if (next_i->i_count && + !(next_i->i_state & + (I_CLEAR|I_FREEING|I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); + } + + if (need_iput_tmp) + iput(need_iput_tmp); + + /* for each watch, send IN_UNMOUNT and then remove it */ + mutex_lock(&inode->inotify_mutex); + watches = &inode->inotify_watches; + list_for_each_entry_safe(watch, next_w, watches, i_list) { + struct inotify_handle *ih = watch->ih; + get_inotify_watch(watch); + mutex_lock(&ih->mutex); + ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0, NULL, NULL); + inotify_remove_watch_locked(ih, watch); + mutex_unlock(&ih->mutex); + put_inotify_watch(watch); + } + mutex_unlock(&inode->inotify_mutex); + iput(inode); + } } } EXPORT_SYMBOL_GPL(inotify_unmount_inodes); Index: linux-2.6/fs/smbfs/dir.c =================================================================== --- linux-2.6.orig/fs/smbfs/dir.c +++ linux-2.6/fs/smbfs/dir.c @@ -405,6 +405,7 @@ void smb_renew_times(struct dentry * dentry) { dget(dentry); +again: spin_lock(&dentry->d_lock); for (;;) { struct dentry *parent; @@ -413,8 +414,13 @@ smb_renew_times(struct dentry * dentry) if (IS_ROOT(dentry)) break; parent = dentry->d_parent; - dget(parent); + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&dentry->d_lock); + goto again; + } + dget_dlock(parent); spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); dput(dentry); dentry = parent; spin_lock(&dentry->d_lock); Index: linux-2.6/fs/smbfs/proc.c =================================================================== --- linux-2.6.orig/fs/smbfs/proc.c +++ linux-2.6/fs/smbfs/proc.c @@ -332,6 +332,7 @@ static int smb_build_path(struct smb_sb_ * and store it in reversed order [see reverse_string()] */ dget(entry); +again: spin_lock(&entry->d_lock); while (!IS_ROOT(entry)) { struct dentry *parent; @@ -350,6 +351,7 @@ static int smb_build_path(struct smb_sb_ dput(entry); return len; 
} + reverse_string(path, len); path += len; if (unicode) { @@ -361,7 +363,11 @@ static int smb_build_path(struct smb_sb_ maxlen -= len+1; parent = entry->d_parent; - dget(parent); + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&entry->d_lock); + goto again; + } + dget_dlock(parent); spin_unlock(&entry->d_lock); dput(entry); entry = parent; Index: linux-2.6/kernel/cgroup.c =================================================================== --- linux-2.6.orig/kernel/cgroup.c +++ linux-2.6/kernel/cgroup.c @@ -808,25 +808,29 @@ static void cgroup_clear_directory(struc struct list_head *node; BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { struct dentry *d = list_entry(node, struct dentry, d_u.d_child); + + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(node); if (d->d_inode) { /* This should never be called on a cgroup * directory with child cgroups */ BUG_ON(d->d_inode->i_mode & S_IFDIR); - d = dget_locked(d); - spin_unlock(&dcache_lock); + dget_dlock(d); + spin_unlock(&d->d_lock); + spin_unlock(&dentry->d_lock); d_delete(d); simple_unlink(dentry->d_inode, d); dput(d); - spin_lock(&dcache_lock); - } + spin_lock(&dentry->d_lock); + } else + spin_unlock(&d->d_lock); node = dentry->d_subdirs.next; } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); } /* @@ -834,11 +838,16 @@ static void cgroup_clear_directory(struc */ static void cgroup_d_remove_dir(struct dentry *dentry) { + struct dentry *parent; + cgroup_clear_directory(dentry); - spin_lock(&dcache_lock); + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + spin_lock(&dentry->d_lock); list_del_init(&dentry->d_u.d_child); - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); remove_dir(dentry); } @@ -3164,9 +3173,7 @@ again: list_del(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); - 
spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); - spin_unlock(&d->d_lock); cgroup_d_remove_dir(d); dput(d); Index: linux-2.6/arch/powerpc/platforms/cell/spufs/inode.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/inode.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/inode.c @@ -158,18 +158,18 @@ static void spufs_prune_dir(struct dentr mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_u.d_child) { - spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry)) && dentry->d_inode) { - dget_locked(dentry); + dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); simple_unlink(dir->d_inode, dentry); - spin_unlock(&dcache_lock); + /* XXX: what was dcache_lock protecting here? Other + * filesystems (IB, configfs) release dcache_lock + * before unlink */ dput(dentry); } else { spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); } } shrink_dcache_parent(dir); Index: linux-2.6/drivers/infiniband/hw/ipath/ipath_fs.c =================================================================== --- linux-2.6.orig/drivers/infiniband/hw/ipath/ipath_fs.c +++ linux-2.6/drivers/infiniband/hw/ipath/ipath_fs.c @@ -272,18 +272,14 @@ static int remove_file(struct dentry *pa goto bail; } - spin_lock(&dcache_lock); spin_lock(&tmp->d_lock); if (!(d_unhashed(tmp) && tmp->d_inode)) { - dget_locked(tmp); + dget_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, tmp); - } else { + } else spin_unlock(&tmp->d_lock); - spin_unlock(&dcache_lock); - } ret = 0; bail: Index: linux-2.6/fs/configfs/inode.c =================================================================== --- linux-2.6.orig/fs/configfs/inode.c +++ linux-2.6/fs/configfs/inode.c @@ -254,18 +254,14 @@ void configfs_drop_dentry(struct configf struct dentry * dentry = sd->s_dentry; if (dentry) { - spin_lock(&dcache_lock); 
spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { - dget_locked(dentry); + dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else { + } else spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); - } } } Index: linux-2.6/fs/notify/fsnotify.c =================================================================== --- linux-2.6.orig/fs/notify/fsnotify.c +++ linux-2.6/fs/notify/fsnotify.c @@ -52,7 +52,7 @@ void __fsnotify_update_child_dentry_flag /* determine if the children should tell inode about their events */ watched = fsnotify_inode_watches_children(inode); - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ list_for_each_entry(alias, &inode->i_dentry, d_alias) { @@ -61,19 +61,21 @@ void __fsnotify_update_child_dentry_flag /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ + spin_lock(&alias->d_lock); list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { if (!child->d_inode) continue; - spin_lock(&child->d_lock); + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (watched) child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } + spin_unlock(&alias->d_lock); } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } /* Notify this dentry's parent about a child's events. 
*/ @@ -87,13 +89,18 @@ void __fsnotify_parent(struct dentry *de if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) return; +again: spin_lock(&dentry->d_lock); parent = dentry->d_parent; + if (parent != dentry && !spin_trylock(&parent->d_lock)) { + spin_unlock(&dentry->d_lock); + goto again; + } p_inode = parent->d_inode; if (fsnotify_inode_watches_children(p_inode)) { if (p_inode->i_fsnotify_mask & mask) { - dget(parent); + dget_dlock(parent); send = true; } } else { @@ -103,11 +110,13 @@ void __fsnotify_parent(struct dentry *de * children and update their d_flags to let them know p_inode * doesn't care about them any more. */ - dget(parent); + dget_dlock(parent); should_update_children = true; } spin_unlock(&dentry->d_lock); + if (parent != dentry) + spin_unlock(&parent->d_lock); if (send) { /* we are notifying a parent so come up with the new mask which Index: linux-2.6/fs/sysfs/dir.c =================================================================== --- linux-2.6.orig/fs/sysfs/dir.c +++ linux-2.6/fs/sysfs/dir.c @@ -547,19 +547,21 @@ static void sysfs_drop_dentry(struct sys * dput to immediately free the dentry if it is not in use. 
*/ repeat: - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); list_for_each_entry(dentry, &inode->i_dentry, d_alias) { - if (d_unhashed(dentry)) - continue; - dget_locked(dentry); spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + continue; + } + dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); dput(dentry); goto repeat; } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); /* adjust nlink and update timestamp */ mutex_lock(&inode->i_mutex); Index: linux-2.6/fs/seq_file.c =================================================================== --- linux-2.6.orig/fs/seq_file.c +++ linux-2.6/fs/seq_file.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -462,9 +463,10 @@ int seq_path_root(struct seq_file *m, st if (size) { char *p; - spin_lock(&dcache_lock); + vfsmount_read_lock(); p = __d_path(path, root, buf, size); - spin_unlock(&dcache_lock); + vfsmount_read_unlock(); + res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); Index: linux-2.6/fs/configfs/configfs_internal.h =================================================================== --- linux-2.6.orig/fs/configfs/configfs_internal.h +++ linux-2.6/fs/configfs/configfs_internal.h @@ -120,7 +120,7 @@ static inline struct config_item *config { struct config_item * item = NULL; - spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { struct configfs_dirent * sd = dentry->d_fsdata; if (sd->s_type & CONFIGFS_ITEM_LINK) { @@ -129,7 +129,7 @@ static inline struct config_item *config } else item = config_item_get(sd->s_element); } - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); return item; } Index: linux-2.6/fs/ocfs2/dcache.c =================================================================== --- linux-2.6.orig/fs/ocfs2/dcache.c +++ linux-2.6/fs/ocfs2/dcache.c @@ -151,23 +151,25 @@ struct dentry 
*ocfs2_find_local_alias(st struct list_head *p; struct dentry *dentry = NULL; - spin_lock(&dcache_lock); - + spin_lock(&inode->i_lock); list_for_each(p, &inode->i_dentry) { dentry = list_entry(p, struct dentry, d_alias); + spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { mlog(0, "dentry found: %.*s\n", dentry->d_name.len, dentry->d_name.name); - dget_locked(dentry); + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); break; } + spin_unlock(&dentry->d_lock); dentry = NULL; } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); return dentry; } Index: linux-2.6/drivers/usb/core/inode.c =================================================================== --- linux-2.6.orig/drivers/usb/core/inode.c +++ linux-2.6/drivers/usb/core/inode.c @@ -347,17 +347,16 @@ static int usbfs_empty (struct dentry *d { struct list_head *list; - spin_lock(&dcache_lock); - + spin_lock(&dentry->d_lock); list_for_each(list, &dentry->d_subdirs) { struct dentry *de = list_entry(list, struct dentry, d_u.d_child); if (usbfs_positive(de)) { - spin_unlock(&dcache_lock); + spin_unlock(&dentry->d_lock); return 0; } } + spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); return 1; } Index: linux-2.6/fs/autofs4/inode.c =================================================================== --- linux-2.6.orig/fs/autofs4/inode.c +++ linux-2.6/fs/autofs4/inode.c @@ -109,8 +109,9 @@ static void autofs4_force_release(struct if (!sbi->sb->s_root) return; - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); repeat: + spin_lock(&this_parent->d_lock); next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { @@ -123,33 +124,39 @@ resume: } if (!list_empty(&dentry->d_subdirs)) { + spin_unlock(&this_parent->d_lock); this_parent = dentry; goto repeat; } next = next->next; - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + spin_unlock(&autofs4_lock); DPRINTK("dentry %p %.*s", dentry, (int)dentry->d_name.len, 
dentry->d_name.name); dput(dentry); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&this_parent->d_lock); } if (this_parent != sbi->sb->s_root) { struct dentry *dentry = this_parent; next = this_parent->d_u.d_child.next; + spin_unlock(&this_parent->d_lock); this_parent = this_parent->d_parent; - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); DPRINTK("parent dentry %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); dput(dentry); - spin_lock(&dcache_lock); + spin_lock(&autofs4_lock); + spin_lock(&this_parent->d_lock); goto resume; } - spin_unlock(&dcache_lock); + spin_unlock(&this_parent->d_lock); + spin_unlock(&autofs4_lock); } void autofs4_kill_sb(struct super_block *sb) Index: linux-2.6/fs/coda/cache.c =================================================================== --- linux-2.6.orig/fs/coda/cache.c +++ linux-2.6/fs/coda/cache.c @@ -86,7 +86,7 @@ static void coda_flag_children(struct de struct list_head *child; struct dentry *de; - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); list_for_each(child, &parent->d_subdirs) { de = list_entry(child, struct dentry, d_u.d_child); @@ -95,7 +95,7 @@ static void coda_flag_children(struct de continue; coda_flag_inode(de->d_inode, flag); } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); return; } Index: linux-2.6/fs/ncpfs/dir.c =================================================================== --- linux-2.6.orig/fs/ncpfs/dir.c +++ linux-2.6/fs/ncpfs/dir.c @@ -364,21 +364,21 @@ ncp_dget_fpos(struct dentry *dentry, str } /* If a pointer is invalid, we search the dentry. 
*/ - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) - dget_locked(dent); + dget(dent); else dent = NULL; - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); goto out; } next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); return NULL; out: Index: linux-2.6/fs/ncpfs/ncplib_kernel.h =================================================================== --- linux-2.6.orig/fs/ncpfs/ncplib_kernel.h +++ linux-2.6/fs/ncpfs/ncplib_kernel.h @@ -192,7 +192,7 @@ ncp_renew_dentries(struct dentry *parent struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { dentry = list_entry(next, struct dentry, d_u.d_child); @@ -204,7 +204,7 @@ ncp_renew_dentries(struct dentry *parent next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); } static inline void @@ -214,7 +214,7 @@ ncp_invalidate_dircache_entries(struct d struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { dentry = list_entry(next, struct dentry, d_u.d_child); @@ -222,7 +222,7 @@ ncp_invalidate_dircache_entries(struct d ncp_age_dentry(server, dentry); next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); } struct ncp_cache_head { Index: linux-2.6/fs/smbfs/cache.c =================================================================== --- linux-2.6.orig/fs/smbfs/cache.c +++ linux-2.6/fs/smbfs/cache.c @@ -62,7 +62,7 @@ smb_invalidate_dircache_entries(struct d struct list_head *next; struct dentry *dentry; - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != 
&parent->d_subdirs) { dentry = list_entry(next, struct dentry, d_u.d_child); @@ -70,7 +70,7 @@ smb_invalidate_dircache_entries(struct d smb_age_dentry(server, dentry); next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); } /* @@ -96,13 +96,13 @@ smb_dget_fpos(struct dentry *dentry, str } /* If a pointer is invalid, we search the dentry. */ - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) - dget_locked(dent); + dget(dent); else dent = NULL; goto out_unlock; @@ -111,7 +111,7 @@ smb_dget_fpos(struct dentry *dentry, str } dent = NULL; out_unlock: - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); return dent; } Index: linux-2.6/security/selinux/selinuxfs.c =================================================================== --- linux-2.6.orig/security/selinux/selinuxfs.c +++ linux-2.6/security/selinux/selinuxfs.c @@ -943,24 +943,28 @@ static void sel_remove_entries(struct de { struct list_head *node; - spin_lock(&dcache_lock); + spin_lock(&de->d_lock); node = de->d_subdirs.next; while (node != &de->d_subdirs) { struct dentry *d = list_entry(node, struct dentry, d_u.d_child); + + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(node); if (d->d_inode) { - d = dget_locked(d); - spin_unlock(&dcache_lock); + dget_dlock(d); + spin_unlock(&de->d_lock); + spin_unlock(&d->d_lock); d_delete(d); simple_unlink(de->d_inode, d); dput(d); - spin_lock(&dcache_lock); - } + spin_lock(&de->d_lock); + } else + spin_unlock(&d->d_lock); node = de->d_subdirs.next; } - spin_unlock(&dcache_lock); + spin_unlock(&de->d_lock); } #define BOOL_DIR_NAME "booleans" Index: linux-2.6/fs/affs/amigaffs.c =================================================================== --- linux-2.6.orig/fs/affs/amigaffs.c +++ linux-2.6/fs/affs/amigaffs.c @@ -128,7 +128,7 @@ 
affs_fix_dcache(struct dentry *dentry, u void *data = dentry->d_fsdata; struct list_head *head, *next; - spin_lock(&dcache_lock); + spin_lock(&inode->i_lock); head = &inode->i_dentry; next = head->next; while (next != head) { @@ -139,7 +139,7 @@ affs_fix_dcache(struct dentry *dentry, u } next = next->next; } - spin_unlock(&dcache_lock); + spin_unlock(&inode->i_lock); } Index: linux-2.6/fs/nfs/getroot.c =================================================================== --- linux-2.6.orig/fs/nfs/getroot.c +++ linux-2.6/fs/nfs/getroot.c @@ -55,7 +55,9 @@ static int nfs_superblock_set_dummy_root return -ENOMEM; } /* Circumvent igrab(): we know the inode is not being freed */ - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); /* * Ensure that this dentry is invisible to d_find_alias(). * Otherwise, it may be spliced into the tree by @@ -64,9 +66,11 @@ static int nfs_superblock_set_dummy_root * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. 
*/ - spin_lock(&dcache_lock); + spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&sb->s_root->d_lock); list_del_init(&sb->s_root->d_alias); - spin_unlock(&dcache_lock); + spin_unlock(&sb->s_root->d_lock); + spin_unlock(&sb->s_root->d_inode->i_lock); } return 0; } Index: linux-2.6/drivers/staging/pohmelfs/path_entry.c =================================================================== --- linux-2.6.orig/drivers/staging/pohmelfs/path_entry.c +++ linux-2.6/drivers/staging/pohmelfs/path_entry.c @@ -84,10 +84,11 @@ out: int pohmelfs_path_length(struct pohmelfs_inode *pi) { struct dentry *d, *root, *first; - int len = 1; /* Root slash */ + int len; + unsigned seq; - first = d = d_find_alias(&pi->vfs_inode); - if (!d) { + first = d_find_alias(&pi->vfs_inode); + if (!first) { dprintk("%s: ino: %llu, mode: %o.\n", __func__, pi->ino, pi->vfs_inode.i_mode); return -ENOENT; } @@ -96,7 +97,11 @@ int pohmelfs_path_length(struct pohmelfs root = dget(current->fs->root.dentry); read_unlock(¤t->fs->lock); - spin_lock(&dcache_lock); +rename_retry: + len = 1; /* Root slash */ + d = first; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); if (!IS_ROOT(d) && d_unhashed(d)) len += UNHASHED_OBSCURE_STRING_SIZE; /* Obscure " (deleted)" string */ @@ -105,7 +110,9 @@ int pohmelfs_path_length(struct pohmelfs len += d->d_name.len + 1; /* Plus slash */ d = d->d_parent; } - spin_unlock(&dcache_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; dput(root); dput(first); Index: linux-2.6/fs/autofs4/waitq.c =================================================================== --- linux-2.6.orig/fs/autofs4/waitq.c +++ linux-2.6/fs/autofs4/waitq.c @@ -186,16 +186,26 @@ static int autofs4_getpath(struct autofs { struct dentry *root = sbi->sb->s_root; struct dentry *tmp; - char *buf = *name; + char *buf; char *p; - int len = 0; + int len; + unsigned seq; - spin_lock(&dcache_lock); +rename_retry: + buf = *name; + len = 0; + + seq = 
read_seqbegin(&rename_lock); + rcu_read_lock(); + spin_lock(&autofs4_lock); for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) len += tmp->d_name.len + 1; if (!len || --len > NAME_MAX) { - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return 0; } @@ -208,7 +218,10 @@ static int autofs4_getpath(struct autofs p -= tmp->d_name.len; strncpy(p, tmp->d_name.name, tmp->d_name.len); } - spin_unlock(&dcache_lock); + spin_unlock(&autofs4_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; return len; } Index: linux-2.6/fs/nfs/namespace.c =================================================================== --- linux-2.6.orig/fs/nfs/namespace.c +++ linux-2.6/fs/nfs/namespace.c @@ -48,12 +48,17 @@ char *nfs_path(const char *base, const struct dentry *dentry, char *buffer, ssize_t buflen) { - char *end = buffer+buflen; + char *end; int namelen; + unsigned seq; +rename_retry: + end = buffer+buflen; *--end = '\0'; buflen--; - spin_lock(&dcache_lock); + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); while (!IS_ROOT(dentry) && dentry != droot) { namelen = dentry->d_name.len; buflen -= namelen + 1; @@ -64,7 +69,9 @@ char *nfs_path(const char *base, *--end = '/'; dentry = dentry->d_parent; } - spin_unlock(&dcache_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; if (*end != '/') { if (--buflen < 0) goto Elong; @@ -81,7 +88,9 @@ char *nfs_path(const char *base, memcpy(end, base, namelen); return end; Elong_unlock: - spin_unlock(&dcache_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; Elong: return ERR_PTR(-ENAMETOOLONG); } Index: linux-2.6/Documentation/filesystems/Locking =================================================================== --- linux-2.6.orig/Documentation/filesystems/Locking +++ linux-2.6/Documentation/filesystems/Locking @@ -17,7 +17,7 @@ prototypes: 
void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); -locking rules: +locking rules: XXX: update these!! none have BKL dcache_lock rename_lock ->d_lock may block d_revalidate: no no no yes Index: linux-2.6/include/linux/fsnotify_backend.h =================================================================== --- linux-2.6.orig/include/linux/fsnotify_backend.h +++ linux-2.6/include/linux/fsnotify_backend.h @@ -276,10 +276,10 @@ static inline void __fsnotify_update_dca { struct dentry *parent; - assert_spin_locked(&dcache_lock); assert_spin_locked(&dentry->d_lock); parent = dentry->d_parent; + /* XXX: after dcache_lock removal, there is a race with parent->d_inode and fsnotify_inode_watches_children. must fix */ if (parent->d_inode && fsnotify_inode_watches_children(parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else @@ -288,15 +288,12 @@ static inline void __fsnotify_update_dca /* * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. 
*/ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) { if (!inode) return; - assert_spin_locked(&dcache_lock); - spin_lock(&dentry->d_lock); __fsnotify_update_dcache_flags(dentry); spin_unlock(&dentry->d_lock); @@ -347,7 +344,7 @@ extern void fsnotify_destroy_mark_by_ent extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry); extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry); -extern void fsnotify_unmount_inodes(struct list_head *list); +extern void fsnotify_unmount_inodes(struct super_block *sb); /* put here because inotify does some weird stuff when destroying watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, @@ -377,7 +374,7 @@ static inline u32 fsnotify_get_cookie(vo return 0; } -static inline void fsnotify_unmount_inodes(struct list_head *list) +static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ Index: linux-2.6/fs/autofs4/autofs_i.h =================================================================== --- linux-2.6.orig/fs/autofs4/autofs_i.h +++ linux-2.6/fs/autofs4/autofs_i.h @@ -16,6 +16,7 @@ #include #include #include +#include #include /* This is the range of ioctl() numbers we claim as ours */ @@ -60,6 +61,8 @@ do { \ current->pid, __func__, ##args); \ } while (0) +extern spinlock_t autofs4_lock; + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. 
It holds a reference to the dentry, so dentries are never Index: linux-2.6/fs/drop_caches.c =================================================================== --- linux-2.6.orig/fs/drop_caches.c +++ linux-2.6/fs/drop_caches.c @@ -14,23 +14,35 @@ int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb) { - struct inode *inode, *toput_inode = NULL; + int i; - spin_lock(&inode_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - if (inode->i_mapping->nrpages == 0) - continue; - __iget(inode); - spin_unlock(&inode_lock); - invalidate_mapping_pages(inode->i_mapping, 0, -1); + for_each_possible_cpu(i) { + struct inode *inode, *toput_inode = NULL; + struct list_head *list; +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_inodes, i); +#else + list = &sb->s_inodes; +#endif + rcu_read_lock(); + list_for_each_entry_rcu(inode, list, i_sb_list) { + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW) + || inode->i_mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + iput(toput_inode); + toput_inode = inode; + rcu_read_lock(); + } + rcu_read_unlock(); iput(toput_inode); - toput_inode = inode; - spin_lock(&inode_lock); } - spin_unlock(&inode_lock); - iput(toput_inode); } static void drop_pagecache(void) Index: linux-2.6/fs/fs-writeback.c =================================================================== --- linux-2.6.orig/fs/fs-writeback.c +++ linux-2.6/fs/fs-writeback.c @@ -285,6 +285,7 @@ static void redirty_tail(struct inode *i { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&wb_inode_list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -302,13 +303,14 @@ static void requeue_io(struct inode *ino { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + 
assert_spin_locked(&wb_inode_list_lock); list_move(&inode->i_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { /* - * Prevent speculative execution through spin_unlock(&inode_lock); + * Prevent speculative execution through spin_unlock(&inode->i_lock); */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); @@ -342,6 +344,7 @@ static void move_expired_inodes(struct l struct inode *inode; int do_sb_sort = 0; + assert_spin_locked(&wb_inode_list_lock); while (!list_empty(delaying_queue)) { inode = list_entry(delaying_queue->prev, struct inode, i_list); if (older_than_this && @@ -397,9 +400,11 @@ static void inode_wait_for_writeback(str wqh = bit_waitqueue(&inode->i_state, __I_SYNC); do { - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); + spin_unlock(&inode->i_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); + spin_lock(&wb_inode_list_lock); } while (inode->i_state & I_SYNC); } @@ -424,7 +429,7 @@ writeback_single_inode(struct inode *ino unsigned dirty; int ret; - if (!atomic_read(&inode->i_count)) + if (!inode->i_count) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); @@ -456,7 +461,8 @@ writeback_single_inode(struct inode *ino inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY; - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); + spin_unlock(&inode->i_lock); ret = do_writepages(mapping, wbc); @@ -473,7 +479,8 @@ writeback_single_inode(struct inode *ino ret = err; } - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); + spin_lock(&wb_inode_list_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { @@ -532,11 +539,11 @@ select_queue: inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } - } else if (atomic_read(&inode->i_count)) { + } else if (inode->i_count) { /* * The inode is clean, inuse */ - 
list_move(&inode->i_list, &inode_in_use); + list_del_init(&inode->i_list); } else { /* * The inode is clean, unused @@ -617,7 +624,8 @@ static void writeback_inodes_wb(struct b const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ - spin_lock(&inode_lock); +again: + spin_lock(&wb_inode_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -627,11 +635,17 @@ static void writeback_inodes_wb(struct b struct inode, i_list); long pages_skipped; + if (!spin_trylock(&inode->i_lock)) { + spin_unlock(&wb_inode_list_lock); + goto again; + } + /* * super block given and doesn't match, skip this inode */ if (sb && sb != inode->i_sb) { redirty_tail(inode); + spin_unlock(&inode->i_lock); continue; } @@ -642,6 +656,7 @@ static void writeback_inodes_wb(struct b * Dirty memory-backed blockdev: the ramdisk * driver does this. Skip just this inode */ + spin_unlock(&inode->i_lock); continue; } /* @@ -649,31 +664,39 @@ static void writeback_inodes_wb(struct b * than the kernel-internal bdev filesystem. Skip the * entire superblock. */ + spin_unlock(&inode->i_lock); break; } - if (inode->i_state & (I_NEW | I_WILL_FREE)) { - requeue_io(inode); - continue; - } - if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { wbc->encountered_congestion = 1; - if (!is_blkdev_sb) + if (!is_blkdev_sb) { + spin_unlock(&inode->i_lock); break; /* Skip a congested fs */ + } requeue_io(inode); + spin_unlock(&inode->i_lock); continue; /* Skip a congested blockdev */ } + if (inode->i_state & (I_NEW | I_WILL_FREE)) { + requeue_io(inode); + spin_unlock(&inode->i_lock); + continue; + } + /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. 
*/ - if (inode_dirtied_after(inode, start)) + if (inode_dirtied_after(inode, start)) { + spin_unlock(&inode->i_lock); break; + } if (pin_sb_for_writeback(wbc, inode, &pin_sb)) { requeue_io(inode); + spin_unlock(&inode->i_lock); continue; } @@ -688,10 +711,11 @@ static void writeback_inodes_wb(struct b */ redirty_tail(inode); } - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); + spin_unlock(&inode->i_lock); iput(inode); cond_resched(); - spin_lock(&inode_lock); + spin_lock(&wb_inode_list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; break; @@ -699,10 +723,9 @@ static void writeback_inodes_wb(struct b if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } + spin_unlock(&wb_inode_list_lock); unpin_sb_for_writeback(&pin_sb); - - spin_unlock(&inode_lock); /* Leave any unwritten inodes on b_io */ } @@ -814,13 +837,19 @@ static long wb_writeback(struct bdi_writ * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_lock); +retry: + spin_lock(&wb_inode_list_lock); if (!list_empty(&wb->b_more_io)) { inode = list_entry(wb->b_more_io.prev, struct inode, i_list); + if (!spin_trylock(&inode->i_lock)) { + spin_unlock(&wb_inode_list_lock); + goto retry; + } inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); } - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); } return wrote; @@ -867,7 +896,7 @@ static long wb_check_old_data_flush(stru wb->last_old_flush = jiffies; nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + get_nr_inodes() - inodes_stat.nr_unused; if (nr_pages) { struct wb_writeback_args args = { @@ -1074,7 +1103,7 @@ void __mark_inode_dirty(struct inode *in if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1115,11 +1144,13 @@ void 
__mark_inode_dirty(struct inode *in } inode->dirtied_when = jiffies; + spin_lock(&wb_inode_list_lock); list_move(&inode->i_list, &wb->b_dirty); + spin_unlock(&wb_inode_list_lock); } } out: - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1142,7 +1173,7 @@ EXPORT_SYMBOL(__mark_inode_dirty); */ static void wait_sb_inodes(struct super_block *sb) { - struct inode *inode, *old_inode = NULL; + int i; /* * We need to be protected against the filesystem going from @@ -1150,44 +1181,57 @@ static void wait_sb_inodes(struct super_ */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); - - /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. - */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; - - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) - continue; - __iget(inode); - spin_unlock(&inode_lock); + for_each_possible_cpu(i) { + struct inode *inode, *old_inode = NULL; + struct list_head *list; +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_inodes, i); +#else + list = &sb->s_inodes; +#endif /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. 
+ * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. */ - iput(old_inode); - old_inode = inode; + rcu_read_lock(); + list_for_each_entry_rcu(inode, list, i_sb_list) { + struct address_space *mapping; - filemap_fdatawait(mapping); + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; - cond_resched(); + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * i_lock. We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it + * under spinlock. So we keep the reference and iput it + * later. + */ + iput(old_inode); + old_inode = inode; - spin_lock(&inode_lock); + filemap_fdatawait(mapping); + + cond_resched(); + + rcu_read_lock(); + } + rcu_read_unlock(); + iput(old_inode); } - spin_unlock(&inode_lock); - iput(old_inode); } /** @@ -1206,7 +1250,7 @@ void writeback_inodes_sb(struct super_bl long nr_to_write; nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + get_nr_inodes() - inodes_stat.nr_unused; bdi_start_writeback(sb->s_bdi, sb, nr_to_write); } @@ -1250,9 +1294,11 @@ int write_inode_now(struct inode *inode, wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); + spin_lock(&wb_inode_list_lock); ret = writeback_single_inode(inode, &wbc); - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); + spin_unlock(&inode->i_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1274,9 +1320,11 @@ int sync_inode(struct inode *inode, stru { int ret; - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); + spin_lock(&wb_inode_list_lock); ret = writeback_single_inode(inode, wbc); - spin_unlock(&inode_lock); + 
spin_unlock(&wb_inode_list_lock); + spin_unlock(&inode->i_lock); return ret; } EXPORT_SYMBOL(sync_inode); Index: linux-2.6/fs/inode.c =================================================================== --- linux-2.6.orig/fs/inode.c +++ linux-2.6/fs/inode.c @@ -75,9 +75,13 @@ static unsigned int i_hash_shift __read_ * allowing for low-overhead inode sync() operations. */ -LIST_HEAD(inode_in_use); LIST_HEAD(inode_unused); -static struct hlist_head *inode_hashtable __read_mostly; + +struct inode_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; +static struct inode_hash_bucket *inode_hashtable __read_mostly; /* * A simple spinlock to protect the list manipulations. @@ -85,7 +89,8 @@ static struct hlist_head *inode_hashtabl * NOTE! You also have to own the lock if you change * the i_state of an inode while it is in use.. */ -DEFINE_SPINLOCK(inode_lock); +static DEFINE_PER_CPU(spinlock_t, inode_cpulock); +DEFINE_SPINLOCK(wb_inode_list_lock); /* * iprune_sem provides exclusion between the kswapd or try_to_free_pages @@ -104,10 +109,37 @@ static DECLARE_RWSEM(iprune_sem); /* * Statistics gathering.. 
*/ -struct inodes_stat_t inodes_stat; +struct inodes_stat_t inodes_stat = { + .nr_inodes = 0, + .nr_unused = 0, +}; +struct percpu_counter nr_inodes; static struct kmem_cache *inode_cachep __read_mostly; +int get_nr_inodes(void) +{ + return percpu_counter_sum_positive(&nr_inodes); +} + +/* + * Handle nr_dentry sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_inodes(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + inodes_stat.nr_inodes = get_nr_inodes(); + return proc_dointvec(table, write, buffer, lenp, ppos); +} +#else +int proc_nr_inodes(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + static void wake_up_inode(struct inode *inode) { /* @@ -135,7 +167,7 @@ int inode_init_always(struct super_block inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; - atomic_set(&inode->i_count, 1); + inode->i_count = 1; inode->i_op = &empty_iops; inode->i_fop = &empty_fops; inode->i_nlink = 1; @@ -247,13 +279,20 @@ void __destroy_inode(struct inode *inode } EXPORT_SYMBOL(__destroy_inode); +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(inode_cachep, inode); +} + void destroy_inode(struct inode *inode) { __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else - kmem_cache_free(inode_cachep, (inode)); + call_rcu(&inode->i_rcu, i_callback); } /* @@ -267,6 +306,7 @@ void inode_init_once(struct inode *inode INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_list); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); @@ -292,21 +332,6 @@ static void init_once(void *foo) inode_init_once(inode); } 
-/* - * inode_lock must be held - */ -void __iget(struct inode *inode) -{ - if (atomic_read(&inode->i_count)) { - atomic_inc(&inode->i_count); - return; - } - atomic_inc(&inode->i_count); - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_in_use); - inodes_stat.nr_unused--; -} - /** * clear_inode - clear an inode * @inode: inode to clear @@ -350,65 +375,70 @@ static void dispose_list(struct list_hea struct inode *inode; inode = list_first_entry(head, struct inode, i_list); - list_del(&inode->i_list); + list_del_init(&inode->i_list); if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode_lock); + spin_lock(&inode->i_lock); + __remove_inode_hash(inode); + inode_sb_list_del(inode); + spin_unlock(&inode->i_lock); wake_up_inode(inode); destroy_inode(inode); nr_disposed++; } - spin_lock(&inode_lock); - inodes_stat.nr_inodes -= nr_disposed; - spin_unlock(&inode_lock); } /* * Invalidate all inodes for a device. */ -static int invalidate_list(struct list_head *head, struct list_head *dispose) +static int invalidate_sb_inodes(struct super_block *sb, struct list_head *dispose) { - struct list_head *next; - int busy = 0, count = 0; - - next = head->next; - for (;;) { - struct list_head *tmp = next; - struct inode *inode; + int busy = 0; + int i; - /* - * We can reschedule here without worrying about the list's - * consistency because the per-sb list of inodes must not - * change during umount anymore, and because iprune_sem keeps - * shrink_icache_memory() away. 
- */ - cond_resched_lock(&inode_lock); + for_each_possible_cpu(i) { + struct list_head *next; + struct list_head *head; +#ifdef CONFIG_SMP + head = per_cpu_ptr(sb->s_inodes, i); +#else + head = &sb->s_inodes; +#endif - next = next->next; - if (tmp == head) - break; - inode = list_entry(tmp, struct inode, i_sb_list); - if (inode->i_state & I_NEW) - continue; - invalidate_inode_buffers(inode); - if (!atomic_read(&inode->i_count)) { - list_move(&inode->i_list, dispose); - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - count++; - continue; + next = head->next; + for (;;) { + struct list_head *tmp = next; + struct inode *inode; + + next = next->next; + if (tmp == head) + break; + inode = list_entry(tmp, struct inode, i_sb_list); + spin_lock(&inode->i_lock); + if (inode->i_state & I_NEW) { + spin_unlock(&inode->i_lock); + continue; + } + invalidate_inode_buffers(inode); + if (!inode->i_count) { + spin_lock(&wb_inode_list_lock); + list_del(&inode->i_list); + inodes_stat.nr_unused--; + spin_unlock(&wb_inode_list_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); + list_add(&inode->i_list, dispose); + continue; + } + spin_unlock(&inode->i_lock); + busy = 1; } - busy = 1; } - /* only unused inodes may be cached with i_count zero */ - inodes_stat.nr_unused -= count; return busy; } @@ -425,12 +455,17 @@ int invalidate_inodes(struct super_block int busy; LIST_HEAD(throw_away); + /* + * Don't need to worry about the list's consistency because the per-sb + * list of inodes must not change during umount anymore, and because + * iprune_sem keeps shrink_icache_memory() away. + */ down_write(&iprune_sem); - spin_lock(&inode_lock); - inotify_unmount_inodes(&sb->s_inodes); - fsnotify_unmount_inodes(&sb->s_inodes); - busy = invalidate_list(&sb->s_inodes, &throw_away); - spin_unlock(&inode_lock); +// spin_lock(&sb_inode_list_lock); XXX: is this safe? 
+ inotify_unmount_inodes(sb); + fsnotify_unmount_inodes(sb); + busy = invalidate_sb_inodes(sb, &throw_away); +// spin_unlock(&sb_inode_list_lock); dispose_list(&throw_away); up_write(&iprune_sem); @@ -445,7 +480,7 @@ static int can_unuse(struct inode *inode return 0; if (inode_has_buffers(inode)) return 0; - if (atomic_read(&inode->i_count)) + if (inode->i_count) return 0; if (inode->i_data.nrpages) return 0; @@ -468,12 +503,12 @@ static int can_unuse(struct inode *inode static void prune_icache(int nr_to_scan) { LIST_HEAD(freeable); - int nr_pruned = 0; int nr_scanned; unsigned long reap = 0; down_read(&iprune_sem); - spin_lock(&inode_lock); +again: + spin_lock(&wb_inode_list_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; @@ -482,36 +517,56 @@ static void prune_icache(int nr_to_scan) inode = list_entry(inode_unused.prev, struct inode, i_list); - if (inode->i_state || atomic_read(&inode->i_count)) { + if (!spin_trylock(&inode->i_lock)) { + spin_unlock(&wb_inode_list_lock); + goto again; + } + if (inode->i_count) { + list_del_init(&inode->i_list); + spin_unlock(&inode->i_lock); + inodes_stat.nr_unused--; + continue; + } + if (inode->i_state) { list_move(&inode->i_list, &inode_unused); + spin_unlock(&inode->i_lock); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { + spin_unlock(&wb_inode_list_lock); __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); - spin_lock(&inode_lock); +again2: + spin_lock(&wb_inode_list_lock); + /* XXX: may no longer work well */ if (inode != list_entry(inode_unused.next, struct inode, i_list)) continue; /* wrong inode or list_empty */ - if (!can_unuse(inode)) + if (!spin_trylock(&inode->i_lock)) { + spin_unlock(&wb_inode_list_lock); + goto again2; + } + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); continue; + } } list_move(&inode->i_list, 
&freeable); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - nr_pruned++; + spin_unlock(&inode->i_lock); + inodes_stat.nr_unused--; } - inodes_stat.nr_unused -= nr_pruned; if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); dispose_list(&freeable); up_read(&iprune_sem); @@ -538,7 +593,7 @@ static int shrink_icache_memory(int nr, return -1; prune_icache(nr); } - return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + return inodes_stat.nr_unused / 100 * sysctl_vfs_cache_pressure; } static struct shrinker icache_shrinker = { @@ -554,7 +609,7 @@ static void __wait_on_freeing_inode(stru * add any additional branch in the common code. */ static struct inode *find_inode(struct super_block *sb, - struct hlist_head *head, + struct inode_hash_bucket *b, int (*test)(struct inode *, void *), void *data) { @@ -562,17 +617,27 @@ static struct inode *find_inode(struct s struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { + rcu_read_lock(); + hlist_for_each_entry_rcu(inode, node, &b->head, i_hash) { if (inode->i_sb != sb) continue; - if (!test(inode, data)) + spin_lock(&inode->i_lock); + if (hlist_unhashed(&inode->i_hash)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { + rcu_read_unlock(); __wait_on_freeing_inode(inode); goto repeat; } break; } + rcu_read_unlock(); return node ? inode : NULL; } @@ -581,23 +646,32 @@ repeat: * iget_locked for details. 
*/ static struct inode *find_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) + struct inode_hash_bucket *b, + unsigned long ino) { struct hlist_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { + rcu_read_lock(); + hlist_for_each_entry_rcu(inode, node, &b->head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; + spin_lock(&inode->i_lock); + if (hlist_unhashed(&inode->i_hash)) { + spin_unlock(&inode->i_lock); + continue; + } if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { + rcu_read_unlock(); __wait_on_freeing_inode(inode); goto repeat; } break; } + rcu_read_unlock(); return node ? inode : NULL; } @@ -611,17 +685,89 @@ static unsigned long hash(struct super_b return tmp & I_HASHMASK; } +static void inode_sb_list_add(struct inode *inode, struct super_block *sb) +{ + spinlock_t *lock; + struct list_head *list; +#ifdef CONFIG_SMP + int cpu; +#endif + + lock = &get_cpu_var(inode_cpulock); +#ifdef CONFIG_SMP + cpu = smp_processor_id(); + list = per_cpu_ptr(sb->s_inodes, cpu); + inode->i_sb_list_cpu = cpu; +#else + list = &sb->s_inodes; +#endif + spin_lock(lock); + list_add_rcu(&inode->i_sb_list, list); + spin_unlock(lock); + put_cpu_var(inode_cpulock); +} + +void inode_sb_list_del(struct inode *inode) +{ + spinlock_t *lock; + +#ifdef CONFIG_SMP + lock = &per_cpu(inode_cpulock, inode->i_sb_list_cpu); +#else + lock = &__get_cpu_var(inode_cpulock); +#endif + spin_lock(lock); + list_del_rcu(&inode->i_sb_list); + spin_unlock(lock); +} + static inline void -__inode_add_to_lists(struct super_block *sb, struct hlist_head *head, +__inode_add_to_lists(struct super_block *sb, struct inode_hash_bucket *b, struct inode *inode) { - inodes_stat.nr_inodes++; - list_add(&inode->i_list, &inode_in_use); - list_add(&inode->i_sb_list, &sb->s_inodes); - if (head) - hlist_add_head(&inode->i_hash, head); + inode_sb_list_add(inode, sb); + percpu_counter_inc(&nr_inodes); + if (b) 
{ + spin_lock(&b->lock); + hlist_add_head(&inode->i_hash, &b->head); + spin_unlock(&b->lock); + } } +#ifdef CONFIG_SMP +/* + * Each cpu owns a range of 1024 numbers. + * 'shared_last_ino' is dirtied only once out of 1024 allocations, + * to renew the exhausted range. + * + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ +static DEFINE_PER_CPU(int, last_ino); +static atomic_t shared_last_ino; + +static int last_ino_get(void) +{ + int *p = &get_cpu_var(last_ino); + int res = *p; + + if (unlikely((res & 1023) == 0)) + res = atomic_add_return(1024, &shared_last_ino) - 1024; + + *p = ++res; + put_cpu_var(last_ino); + return res; +} +#else +static int last_ino_get(void) +{ + static int last_ino; + + return ++last_ino; +} +#endif + /** * inode_add_to_lists - add a new inode to relevant lists * @sb: superblock inode belongs to @@ -636,11 +782,11 @@ __inode_add_to_lists(struct super_block */ void inode_add_to_lists(struct super_block *sb, struct inode *inode) { - struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, inode->i_ino); - spin_lock(&inode_lock); - __inode_add_to_lists(sb, head, inode); - spin_unlock(&inode_lock); + spin_lock(&inode->i_lock); + __inode_add_to_lists(sb, b, inode); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(inode_add_to_lists); @@ -658,23 +804,15 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists); */ struct inode *new_inode(struct super_block *sb) { - /* - * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW - * error if st_ino won't fit in target struct field. Use 32bit counter - * here to attempt to avoid that. 
- */ - static unsigned int last_ino; struct inode *inode; - spin_lock_prefetch(&inode_lock); - inode = alloc_inode(sb); if (inode) { - spin_lock(&inode_lock); - __inode_add_to_lists(sb, NULL, inode); - inode->i_ino = ++last_ino; + spin_lock(&inode->i_lock); + inode->i_ino = last_ino_get(); inode->i_state = 0; - spin_unlock(&inode_lock); + __inode_add_to_lists(sb, NULL, inode); + spin_unlock(&inode->i_lock); } return inode; } @@ -722,7 +860,7 @@ EXPORT_SYMBOL(unlock_new_inode); * -- rmk@arm.uk.linux.org */ static struct inode *get_new_inode(struct super_block *sb, - struct hlist_head *head, + struct inode_hash_bucket *b, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) @@ -733,16 +871,16 @@ static struct inode *get_new_inode(struc if (inode) { struct inode *old; - spin_lock(&inode_lock); /* We released the lock, so.. */ - old = find_inode(sb, head, test, data); + old = find_inode(sb, b, test, data); if (!old) { + spin_lock(&inode->i_lock); if (set(inode, data)) goto set_failed; - __inode_add_to_lists(sb, head, inode); inode->i_state = I_LOCK|I_NEW; - spin_unlock(&inode_lock); + __inode_add_to_lists(sb, b, inode); + spin_unlock(&inode->i_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -756,7 +894,7 @@ static struct inode *get_new_inode(struc * allocated. */ __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -764,7 +902,7 @@ static struct inode *get_new_inode(struc return inode; set_failed: - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); destroy_inode(inode); return NULL; } @@ -774,7 +912,7 @@ set_failed: * comment at iget_locked for details. 
*/ static struct inode *get_new_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) + struct inode_hash_bucket *b, unsigned long ino) { struct inode *inode; @@ -782,14 +920,14 @@ static struct inode *get_new_inode_fast( if (inode) { struct inode *old; - spin_lock(&inode_lock); /* We released the lock, so.. */ - old = find_inode_fast(sb, head, ino); + old = find_inode_fast(sb, b, ino); if (!old) { + spin_lock(&inode->i_lock); inode->i_ino = ino; - __inode_add_to_lists(sb, head, inode); inode->i_state = I_LOCK|I_NEW; - spin_unlock(&inode_lock); + __inode_add_to_lists(sb, b, inode); + spin_unlock(&inode->i_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -803,7 +941,7 @@ static struct inode *get_new_inode_fast( * allocated. */ __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -811,6 +949,23 @@ static struct inode *get_new_inode_fast( return inode; } +static int test_inode_iunique(struct super_block *sb, + struct inode_hash_bucket *b, unsigned long ino) +{ + struct hlist_node *node; + struct inode *inode = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(inode, node, &b->head, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) { + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + return 1; +} + /** * iunique - get a unique inode number * @sb: superblock @@ -832,20 +987,19 @@ ino_t iunique(struct super_block *sb, in * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. 
*/ + static DEFINE_SPINLOCK(unique_lock); static unsigned int counter; - struct inode *inode; - struct hlist_head *head; + struct inode_hash_bucket *b; ino_t res; - spin_lock(&inode_lock); + spin_lock(&unique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; - head = inode_hashtable + hash(sb, res); - inode = find_inode_fast(sb, head, res); - } while (inode != NULL); - spin_unlock(&inode_lock); + b = inode_hashtable + hash(sb, res); + } while (!test_inode_iunique(sb, b, res)); + spin_unlock(&unique_lock); return res; } @@ -853,7 +1007,9 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { - spin_lock(&inode_lock); + struct inode *ret = inode; + + spin_lock(&inode->i_lock); if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))) __iget(inode); else @@ -862,9 +1018,10 @@ struct inode *igrab(struct inode *inode) * called yet, and somebody is calling igrab * while the inode is getting freed. */ - inode = NULL; - spin_unlock(&inode_lock); - return inode; + ret = NULL; + spin_unlock(&inode->i_lock); + + return ret; } EXPORT_SYMBOL(igrab); @@ -888,21 +1045,20 @@ EXPORT_SYMBOL(igrab); * Note, @test is called with the inode_lock held, so can't sleep. */ static struct inode *ifind(struct super_block *sb, - struct hlist_head *head, int (*test)(struct inode *, void *), + struct inode_hash_bucket *b, + int (*test)(struct inode *, void *), void *data, const int wait) { struct inode *inode; - spin_lock(&inode_lock); - inode = find_inode(sb, head, test, data); + inode = find_inode(sb, b, test, data); if (inode) { __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); if (likely(wait)) wait_on_inode(inode); return inode; } - spin_unlock(&inode_lock); return NULL; } @@ -922,19 +1078,18 @@ static struct inode *ifind(struct super_ * Otherwise NULL is returned. 
*/ static struct inode *ifind_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) + struct inode_hash_bucket *b, + unsigned long ino) { struct inode *inode; - spin_lock(&inode_lock); - inode = find_inode_fast(sb, head, ino); + inode = find_inode_fast(sb, b, ino); if (inode) { __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); wait_on_inode(inode); return inode; } - spin_unlock(&inode_lock); return NULL; } @@ -962,9 +1117,9 @@ static struct inode *ifind_fast(struct s struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, hashval); - return ifind(sb, head, test, data, 0); + return ifind(sb, b, test, data, 0); } EXPORT_SYMBOL(ilookup5_nowait); @@ -990,9 +1145,9 @@ EXPORT_SYMBOL(ilookup5_nowait); struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, hashval); - return ifind(sb, head, test, data, 1); + return ifind(sb, b, test, data, 1); } EXPORT_SYMBOL(ilookup5); @@ -1012,9 +1167,9 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, ino); - return ifind_fast(sb, head, ino); + return ifind_fast(sb, b, ino); } EXPORT_SYMBOL(ilookup); @@ -1042,17 +1197,17 @@ struct inode *iget5_locked(struct super_ int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, hashval); struct inode *inode; - inode = ifind(sb, head, test, data, 1); + 
inode = ifind(sb, b, test, data, 1); if (inode) return inode; /* * get_new_inode() will do the right thing, re-trying the search * in case it had to block at any point. */ - return get_new_inode(sb, head, test, set, data); + return get_new_inode(sb, b, test, set, data); } EXPORT_SYMBOL(iget5_locked); @@ -1073,17 +1228,17 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, ino); struct inode *inode; - inode = ifind_fast(sb, head, ino); + inode = ifind_fast(sb, b, ino); if (inode) return inode; /* * get_new_inode_fast() will do the right thing, re-trying the search * in case it had to block at any point. */ - return get_new_inode_fast(sb, head, ino); + return get_new_inode_fast(sb, b, ino); } EXPORT_SYMBOL(iget_locked); @@ -1091,29 +1246,37 @@ int insert_inode_locked(struct inode *in { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; - struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, ino); inode->i_state |= I_LOCK|I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); - hlist_for_each_entry(old, node, head, i_hash) { + +repeat: + spin_lock(&b->lock); + hlist_for_each_entry(old, node, &b->head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) continue; + if (!spin_trylock(&old->i_lock)) { + spin_unlock(&b->lock); + goto repeat; + } break; } if (likely(!node)) { - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); + /* XXX: initialize inode->i_lock to locked? 
*/ + hlist_add_head(&inode->i_hash, &b->head); + spin_unlock(&b->lock); return 0; } + spin_unlock(&b->lock); __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); wait_on_inode(old); if (unlikely(!hlist_unhashed(&old->i_hash))) { iput(old); @@ -1128,7 +1291,7 @@ int insert_inode_locked4(struct inode *i int (*test)(struct inode *, void *), void *data) { struct super_block *sb = inode->i_sb; - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode_hash_bucket *b = inode_hashtable + hash(sb, hashval); inode->i_state |= I_LOCK|I_NEW; @@ -1136,23 +1299,30 @@ int insert_inode_locked4(struct inode *i struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); - hlist_for_each_entry(old, node, head, i_hash) { +repeat: + spin_lock(&b->lock); + hlist_for_each_entry(old, node, &b->head, i_hash) { if (old->i_sb != sb) continue; if (!test(old, data)) continue; if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) continue; + if (!spin_trylock(&old->i_lock)) { + spin_unlock(&b->lock); + goto repeat; + } break; } if (likely(!node)) { - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); + /* XXX: initialize inode->i_lock to locked? 
*/ + hlist_add_head(&inode->i_hash, &b->head); + spin_unlock(&b->lock); return 0; } + spin_unlock(&b->lock); __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); wait_on_inode(old); if (unlikely(!hlist_unhashed(&old->i_hash))) { iput(old); @@ -1173,14 +1343,32 @@ EXPORT_SYMBOL(insert_inode_locked4); */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { - struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); - spin_lock(&inode_lock); - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); + struct inode_hash_bucket *b = inode_hashtable + hash(inode->i_sb, hashval); + + spin_lock(&inode->i_lock); + spin_lock(&b->lock); + hlist_add_head(&inode->i_hash, &b->head); + spin_unlock(&b->lock); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** + * __remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the hash. inode->i_lock must be + * held. + */ +void __remove_inode_hash(struct inode *inode) +{ + struct inode_hash_bucket *b = inode_hashtable + hash(inode->i_sb, inode->i_ino); + spin_lock(&b->lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&b->lock); +} + +/** * remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * @@ -1188,9 +1376,9 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void remove_inode_hash(struct inode *inode) { - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); + spin_lock(&inode->i_lock); + __remove_inode_hash(inode); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(remove_inode_hash); @@ -1210,12 +1398,16 @@ void generic_delete_inode(struct inode * { const struct super_operations *op = inode->i_sb->s_op; - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); + if (!list_empty(&inode->i_list)) { + spin_lock(&wb_inode_list_lock); + list_del_init(&inode->i_list); + spin_unlock(&wb_inode_list_lock); + } + inode_sb_list_del(inode); + 
percpu_counter_dec(&nr_inodes); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); security_inode_delete(inode); @@ -1232,9 +1424,15 @@ void generic_delete_inode(struct inode * truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); } - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); + /* + * i_lock not required to delete from hash. If there was a + * concurrency window, then it would be possible for the other + * thread to touch the inode after it has been freed, with + * destroy_inode. + * XXX: yes it is because find_inode_fast checks it. Maybe we + * can avoid it though... + */ + remove_inode_hash(inode); wake_up_inode(inode); BUG_ON(inode->i_state != I_CLEAR); destroy_inode(inode); @@ -1255,29 +1453,36 @@ int generic_detach_inode(struct inode *i struct super_block *sb = inode->i_sb; if (!hlist_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; + if (list_empty(&inode->i_list)) { + spin_lock(&wb_inode_list_lock); + list_add(&inode->i_list, &inode_unused); + inodes_stat.nr_unused++; + spin_unlock(&wb_inode_list_lock); + } if (sb->s_flags & MS_ACTIVE) { - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); return 0; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); write_inode_now(inode, 1); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; + __remove_inode_hash(inode); + } + if (!list_empty(&inode->i_list)) { + spin_lock(&wb_inode_list_lock); + list_del_init(&inode->i_list); inodes_stat.nr_unused--; - hlist_del_init(&inode->i_hash); + spin_unlock(&wb_inode_list_lock); } - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); + inode_sb_list_del(inode); + 
percpu_counter_dec(&nr_inodes); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); return 1; } EXPORT_SYMBOL_GPL(generic_detach_inode); @@ -1342,8 +1547,12 @@ void iput(struct inode *inode) if (inode) { BUG_ON(inode->i_state == I_CLEAR); - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + spin_lock(&inode->i_lock); + inode->i_count--; + if (inode->i_count == 0) iput_final(inode); + else + spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(iput); @@ -1524,6 +1733,8 @@ EXPORT_SYMBOL(inode_wait); * wake_up_inode() after removing from the hash list will DTRT. * * This is called with inode_lock held. + * + * Called with i_lock held and returns with it dropped. */ static void __wait_on_freeing_inode(struct inode *inode) { @@ -1531,10 +1742,9 @@ static void __wait_on_freeing_inode(stru DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); wq = bit_waitqueue(&inode->i_state, __I_LOCK); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); schedule(); finish_wait(wq, &wait.wait); - spin_lock(&inode_lock); } static __initdata unsigned long ihash_entries; @@ -1562,7 +1772,7 @@ void __init inode_init_early(void) inode_hashtable = alloc_large_system_hash("Inode-cache", - sizeof(struct hlist_head), + sizeof(struct inode_hash_bucket), ihash_entries, 14, HASH_EARLY, @@ -1570,14 +1780,17 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); + for (loop = 0; loop < (1 << i_hash_shift); loop++) { + spin_lock_init(&inode_hashtable[loop].lock); + INIT_HLIST_HEAD(&inode_hashtable[loop].head); + } } void __init inode_init(void) { int loop; + percpu_counter_init(&nr_inodes, 0); /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1587,13 +1800,17 @@ void __init inode_init(void) init_once); 
register_shrinker(&icache_shrinker); + for_each_possible_cpu(loop) { + spin_lock_init(&per_cpu(inode_cpulock, loop)); + } + /* Hash may have been set up in inode_init_early */ if (!hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", - sizeof(struct hlist_head), + sizeof(struct inode_hash_bucket), ihash_entries, 14, 0, @@ -1601,8 +1818,10 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); + for (loop = 0; loop < (1 << i_hash_shift); loop++) { + spin_lock_init(&inode_hashtable[loop].lock); + INIT_HLIST_HEAD(&inode_hashtable[loop].head); + } } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) Index: linux-2.6/fs/quota/dquot.c =================================================================== --- linux-2.6.orig/fs/quota/dquot.c +++ linux-2.6/fs/quota/dquot.c @@ -819,32 +819,49 @@ static int dqinit_needed(struct inode *i /* This routine is guarded by dqonoff_mutex mutex */ static void add_dquot_ref(struct super_block *sb, int type) { - struct inode *inode, *old_inode = NULL; - - spin_lock(&inode_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - if (!atomic_read(&inode->i_writecount)) - continue; - if (!dqinit_needed(inode, type)) - continue; - - __iget(inode); - spin_unlock(&inode_lock); + int i; + for_each_possible_cpu(i) { + struct inode *inode, *old_inode = NULL; + struct list_head *list; +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_inodes, i); +#else + list = &sb->s_inodes; +#endif + rcu_read_lock(); + list_for_each_entry_rcu(inode, list, i_sb_list) { + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!atomic_read(&inode->i_writecount)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!dqinit_needed(inode, type)) { + 
spin_unlock(&inode->i_lock); + continue; + } + + __iget(inode); + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + + iput(old_inode); + sb->dq_op->initialize(inode, type); + /* We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the inode_lock. + * We cannot iput the inode now as we can be holding the last + * reference and we cannot iput it under inode_lock. So we + * keep the reference and iput it later. */ + old_inode = inode; + rcu_read_lock(); + } + rcu_read_unlock(); iput(old_inode); - sb->dq_op->initialize(inode, type); - /* We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the inode_lock. - * We cannot iput the inode now as we can be holding the last - * reference and we cannot iput it under inode_lock. So we - * keep the reference and iput it later. */ - old_inode = inode; - spin_lock(&inode_lock); } - spin_unlock(&inode_lock); - iput(old_inode); } /* @@ -911,20 +928,29 @@ static void put_dquot_list(struct list_h static void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head) { - struct inode *inode; + int i; + for_each_possible_cpu(i) { + struct inode *inode; + struct list_head *list; +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_inodes, i); +#else + list = &sb->s_inodes; +#endif - spin_lock(&inode_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - /* - * We have to scan also I_NEW inodes because they can already - * have quota pointer initialized. Luckily, we need to touch - * only quota pointers and these have separate locking - * (dqptr_sem). - */ - if (!IS_NOQUOTA(inode)) - remove_inode_dquot_ref(inode, type, tofree_head); + rcu_read_lock(); + list_for_each_entry_rcu(inode, list, i_sb_list) { + /* + * We have to scan also I_NEW inodes because they can already + * have quota pointer initialized. Luckily, we need to touch + * only quota pointers and these have separate locking + * (dqptr_sem). 
+ */ + if (!IS_NOQUOTA(inode)) + remove_inode_dquot_ref(inode, type, tofree_head); + } + rcu_read_unlock(); } - spin_unlock(&inode_lock); } /* Gather all references from inodes and drop them */ Index: linux-2.6/include/linux/writeback.h =================================================================== --- linux-2.6.orig/include/linux/writeback.h +++ linux-2.6/include/linux/writeback.h @@ -9,8 +9,8 @@ struct backing_dev_info; -extern spinlock_t inode_lock; -extern struct list_head inode_in_use; +extern spinlock_t sb_inode_list_lock; +extern spinlock_t wb_inode_list_lock; extern struct list_head inode_unused; /* Index: linux-2.6/fs/notify/inode_mark.c =================================================================== --- linux-2.6.orig/fs/notify/inode_mark.c +++ linux-2.6/fs/notify/inode_mark.c @@ -362,65 +362,75 @@ int fsnotify_add_mark(struct fsnotify_ma * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. * We temporarily drop inode_lock, however, and CAN block. */ -void fsnotify_unmount_inodes(struct list_head *list) +void fsnotify_unmount_inodes(struct super_block *sb) { - struct inode *inode, *next_i, *need_iput = NULL; + int i; - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { - struct inode *need_iput_tmp; + for_each_possible_cpu(i) { + struct inode *inode, *next_i, *need_iput = NULL; + struct list_head *list; +#ifdef CONFIG_SMP + list = per_cpu_ptr(sb->s_inodes, i); +#else + list = &sb->s_inodes; +#endif + + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inode *need_iput_tmp; + + spin_lock(&inode->i_lock); + /* + * We cannot __iget() an inode in state I_CLEAR, I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. 
+ */ + if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!inode->i_count) { + spin_unlock(&inode->i_lock); + continue; + } + + need_iput_tmp = need_iput; + need_iput = NULL; + + /* In case fsnotify_inode_delete() drops a reference. */ + if (inode != need_iput_tmp) { + __iget(inode); + } else + need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); + + /* In case the dropping of a reference would nuke next_i. */ + if (&next_i->i_sb_list != list) { + spin_lock(&next_i->i_lock); + if (next_i->i_count && + !(next_i->i_state & + (I_CLEAR | I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); + } + + if (need_iput_tmp) + iput(need_iput_tmp); - /* - * We cannot __iget() an inode in state I_CLEAR, I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) - continue; - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) - continue; - - need_iput_tmp = need_iput; - need_iput = NULL; - - /* In case fsnotify_inode_delete() drops a reference. */ - if (inode != need_iput_tmp) - __iget(inode); - else - need_iput_tmp = NULL; - - /* In case the dropping of a reference would nuke next_i. 
*/ - if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; - } - - /* - * We can safely drop inode_lock here because we hold - * references on both inode and next_i. Also no new inodes - * will be added since the umount has begun. Finally, - * iprune_mutex keeps shrink_icache_memory() away. - */ - spin_unlock(&inode_lock); - - if (need_iput_tmp) - iput(need_iput_tmp); + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + fsnotify_inode_delete(inode); - fsnotify_inode_delete(inode); - - iput(inode); - - spin_lock(&inode_lock); + iput(inode); + } } } Index: linux-2.6/fs/nilfs2/gcdat.c =================================================================== --- linux-2.6.orig/fs/nilfs2/gcdat.c +++ linux-2.6/fs/nilfs2/gcdat.c @@ -27,6 +27,7 @@ #include "page.h" #include "mdt.h" +/* XXX: what protects i_state? 
*/ int nilfs_init_gcdat_inode(struct the_nilfs *nilfs) { struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; Index: linux-2.6/arch/powerpc/platforms/cell/spufs/file.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/file.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/file.c @@ -1548,7 +1548,7 @@ static int spufs_mfc_open(struct inode * if (ctx->owner != current->mm) return -EINVAL; - if (atomic_read(&inode->i_count) != 1) + if (inode->i_count != 1) return -EBUSY; mutex_lock(&ctx->mapping_lock); Index: linux-2.6/fs/affs/inode.c =================================================================== --- linux-2.6.orig/fs/affs/inode.c +++ linux-2.6/fs/affs/inode.c @@ -379,7 +379,9 @@ affs_add_entry(struct inode *dir, struct affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain)); mark_buffer_dirty_inode(inode_bh, inode); inode->i_nlink = 2; - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); } affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); Index: linux-2.6/fs/afs/dir.c =================================================================== --- linux-2.6.orig/fs/afs/dir.c +++ linux-2.6/fs/afs/dir.c @@ -1007,7 +1007,9 @@ static int afs_link(struct dentry *from, if (ret < 0) goto link_error; - atomic_inc(&vnode->vfs_inode.i_count); + spin_lock(&vnode->vfs_inode.i_lock); + vnode->vfs_inode.i_count++; + spin_unlock(&vnode->vfs_inode.i_lock); d_instantiate(dentry, &vnode->vfs_inode); key_put(key); _leave(" = 0"); Index: linux-2.6/fs/block_dev.c =================================================================== --- linux-2.6.orig/fs/block_dev.c +++ linux-2.6/fs/block_dev.c @@ -423,13 +423,20 @@ static struct inode *bdev_alloc_inode(st return &ei->vfs_inode; } -static void bdev_destroy_inode(struct inode *inode) +static void bdev_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, 
i_rcu); struct bdev_inode *bdi = BDEV_I(inode); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(bdev_cachep, bdi); } +static void bdev_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bdev_i_callback); +} + static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; @@ -575,7 +582,12 @@ EXPORT_SYMBOL(bdget); */ struct block_device *bdgrab(struct block_device *bdev) { - atomic_inc(&bdev->bd_inode->i_count); + struct inode *inode = bdev->bd_inode; + + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); + return bdev; } @@ -605,7 +617,9 @@ static struct block_device *bd_acquire(s spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - atomic_inc(&bdev->bd_inode->i_count); + spin_lock(&bdev->bd_inode->i_lock); + bdev->bd_inode->i_count++; + spin_unlock(&bdev->bd_inode->i_lock); spin_unlock(&bdev_lock); return bdev; } @@ -621,7 +635,9 @@ static struct block_device *bd_acquire(s * So, we can access it via ->i_mapping always * without igrab().
*/ - atomic_inc(&bdev->bd_inode->i_count); + spin_lock(&bdev->bd_inode->i_lock); + bdev->bd_inode->i_count++; + spin_unlock(&bdev->bd_inode->i_lock); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); Index: linux-2.6/fs/ext2/namei.c =================================================================== --- linux-2.6.orig/fs/ext2/namei.c +++ linux-2.6/fs/ext2/namei.c @@ -196,7 +196,9 @@ static int ext2_link (struct dentry * ol inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); err = ext2_add_link(dentry, inode); if (!err) { Index: linux-2.6/fs/ext3/ialloc.c =================================================================== --- linux-2.6.orig/fs/ext3/ialloc.c +++ linux-2.6/fs/ext3/ialloc.c @@ -100,9 +100,9 @@ void ext3_free_inode (handle_t *handle, struct ext3_sb_info *sbi; int fatal = 0, err; - if (atomic_read(&inode->i_count) > 1) { + if (inode->i_count > 1) { printk ("ext3_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + inode->i_count); return; } if (inode->i_nlink) { Index: linux-2.6/fs/ext3/namei.c =================================================================== --- linux-2.6.orig/fs/ext3/namei.c +++ linux-2.6/fs/ext3/namei.c @@ -2244,7 +2244,9 @@ retry: inode->i_ctime = CURRENT_TIME_SEC; inc_nlink(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); err = ext3_add_entry(handle, dentry, inode); if (!err) { Index: linux-2.6/fs/xfs/linux-2.6/xfs_iops.c =================================================================== --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_iops.c +++ linux-2.6/fs/xfs/linux-2.6/xfs_iops.c @@ -357,7 +357,9 @@ xfs_vn_link( if (unlikely(error)) return -error; - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); d_instantiate(dentry, 
inode); return 0; } Index: linux-2.6/fs/xfs/xfs_iget.c =================================================================== --- linux-2.6.orig/fs/xfs/xfs_iget.c +++ linux-2.6/fs/xfs/xfs_iget.c @@ -800,7 +800,7 @@ xfs_isilocked( /* 0 */ (void *)(__psint_t)(vk), \ /* 1 */ (void *)(s), \ /* 2 */ (void *)(__psint_t) line, \ -/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \ +/* 3 */ (void *)(__psint_t)VFS_I(ip)->i_count, \ /* 4 */ (void *)(ra), \ /* 5 */ NULL, \ /* 6 */ (void *)(__psint_t)current_cpu(), \ Index: linux-2.6/fs/xfs/xfs_inode.h =================================================================== --- linux-2.6.orig/fs/xfs/xfs_inode.h +++ linux-2.6/fs/xfs/xfs_inode.h @@ -541,8 +541,10 @@ extern void xfs_itrace_rele(struct xfs_i #define IHOLD(ip) \ do { \ - ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ - atomic_inc(&(VFS_I(ip)->i_count)); \ + spin_lock(&VFS_I(ip)->i_lock); \ + ASSERT(VFS_I(ip)->i_count > 0); \ + VFS_I(ip)->i_count++; \ + spin_unlock(&VFS_I(ip)->i_lock); \ xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ } while (0) Index: linux-2.6/ipc/mqueue.c =================================================================== --- linux-2.6.orig/ipc/mqueue.c +++ linux-2.6/ipc/mqueue.c @@ -238,11 +238,18 @@ static struct inode *mqueue_alloc_inode( return &ei->vfs_inode; } -static void mqueue_destroy_inode(struct inode *inode) +static void mqueue_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); } +static void mqueue_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, mqueue_i_callback); +} + static void mqueue_delete_inode(struct inode *inode) { struct mqueue_inode_info *info; @@ -779,8 +786,11 @@ SYSCALL_DEFINE1(mq_unlink, const char __ } inode = dentry->d_inode; - if (inode) - atomic_inc(&inode->i_count); + if (inode) { + spin_lock(&inode->i_lock); + 
inode->i_count++; + spin_unlock(&inode->i_lock); + } err = mnt_want_write(ipc_ns->mq_mnt); if (err) goto out_err; Index: linux-2.6/kernel/futex.c =================================================================== --- linux-2.6.orig/kernel/futex.c +++ linux-2.6/kernel/futex.c @@ -167,7 +167,9 @@ static void get_futex_key_refs(union fut switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); + spin_lock(&key->shared.inode->i_lock); + key->shared.inode->i_count++; + spin_unlock(&key->shared.inode->i_lock); break; case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); Index: linux-2.6/mm/shmem.c =================================================================== --- linux-2.6.orig/mm/shmem.c +++ linux-2.6/mm/shmem.c @@ -1871,7 +1871,9 @@ static int shmem_link(struct dentry *old dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); /* New dentry reference */ + spin_lock(&inode->i_lock); + inode->i_count++; /* New dentry reference */ + spin_unlock(&inode->i_lock); dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: @@ -2383,13 +2385,20 @@ static struct inode *shmem_alloc_inode(s return &p->vfs_inode; } +static void shmem_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + static void shmem_destroy_inode(struct inode *inode) { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); + call_rcu(&inode->i_rcu, shmem_i_callback); } static void init_once(void *foo) Index: linux-2.6/fs/bfs/dir.c =================================================================== --- 
linux-2.6.orig/fs/bfs/dir.c +++ linux-2.6/fs/bfs/dir.c @@ -178,7 +178,9 @@ static int bfs_link(struct dentry *old, inc_nlink(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); d_instantiate(new, inode); mutex_unlock(&info->bfs_lock); return 0; Index: linux-2.6/fs/btrfs/inode.c =================================================================== --- linux-2.6.orig/fs/btrfs/inode.c +++ linux-2.6/fs/btrfs/inode.c @@ -4247,7 +4247,9 @@ static int btrfs_link(struct dentry *old trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); err = btrfs_add_nondir(trans, dentry, inode, 1, index); Index: linux-2.6/fs/exofs/inode.c =================================================================== --- linux-2.6.orig/fs/exofs/inode.c +++ linux-2.6/fs/exofs/inode.c @@ -1038,7 +1038,9 @@ static void create_done(struct osd_reque } else set_obj_created(oi); - atomic_dec(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count--; + spin_unlock(&inode->i_lock); wake_up(&oi->i_wq); } @@ -1104,11 +1106,15 @@ struct inode *exofs_new_inode(struct ino /* increment the refcount so that the inode will still be around when we * reach the callback */ - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); ret = exofs_async_op(or, create_done, inode, oi->i_cred); if (ret) { - atomic_dec(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count--; + spin_unlock(&inode->i_lock); osd_end_request(or); return ERR_PTR(-EIO); } Index: linux-2.6/fs/exofs/namei.c =================================================================== --- linux-2.6.orig/fs/exofs/namei.c +++ linux-2.6/fs/exofs/namei.c @@ -153,7 +153,9 @@ static int exofs_link(struct dentry *old inode->i_ctime = 
CURRENT_TIME; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); return exofs_add_nondir(dentry, inode); } Index: linux-2.6/fs/ext4/ialloc.c =================================================================== --- linux-2.6.orig/fs/ext4/ialloc.c +++ linux-2.6/fs/ext4/ialloc.c @@ -192,9 +192,9 @@ void ext4_free_inode(handle_t *handle, s struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; - if (atomic_read(&inode->i_count) > 1) { + if (inode->i_count > 1) { printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + inode->i_count); return; } if (inode->i_nlink) { Index: linux-2.6/fs/ext4/namei.c =================================================================== --- linux-2.6.orig/fs/ext4/namei.c +++ linux-2.6/fs/ext4/namei.c @@ -2340,7 +2340,9 @@ retry: inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); err = ext4_add_entry(handle, dentry, inode); if (!err) { Index: linux-2.6/fs/gfs2/ops_inode.c =================================================================== --- linux-2.6.orig/fs/gfs2/ops_inode.c +++ linux-2.6/fs/gfs2/ops_inode.c @@ -253,7 +253,9 @@ out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); if (!error) { - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); d_instantiate(dentry, inode); mark_inode_dirty(inode); } Index: linux-2.6/fs/hfsplus/dir.c =================================================================== --- linux-2.6.orig/fs/hfsplus/dir.c +++ linux-2.6/fs/hfsplus/dir.c @@ -301,7 +301,9 @@ static int hfsplus_link(struct dentry *s inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); 
inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); HFSPLUS_SB(sb).file_count++; Index: linux-2.6/fs/hpfs/inode.c =================================================================== --- linux-2.6.orig/fs/hpfs/inode.c +++ linux-2.6/fs/hpfs/inode.c @@ -182,7 +182,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !i->i_count) { if (*hpfs_inode->i_rddir_off) printk("HPFS: write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); hpfs_inode->i_rddir_off = NULL; Index: linux-2.6/fs/jffs2/dir.c =================================================================== --- linux-2.6.orig/fs/jffs2/dir.c +++ linux-2.6/fs/jffs2/dir.c @@ -287,7 +287,9 @@ static int jffs2_link (struct dentry *ol mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - atomic_inc(&old_dentry->d_inode->i_count); + spin_lock(&old_dentry->d_inode->i_lock); + old_dentry->d_inode->i_count++; + spin_unlock(&old_dentry->d_inode->i_lock); } return ret; } @@ -866,7 +868,9 @@ static int jffs2_rename (struct inode *o printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). 
You now have a hard link\n", ret); /* Might as well let the VFS know */ d_instantiate(new_dentry, old_dentry->d_inode); - atomic_inc(&old_dentry->d_inode->i_count); + spin_lock(&old_dentry->d_inode->i_lock); + old_dentry->d_inode->i_count++; + spin_unlock(&old_dentry->d_inode->i_lock); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } Index: linux-2.6/fs/jfs/jfs_txnmgr.c =================================================================== --- linux-2.6.orig/fs/jfs/jfs_txnmgr.c +++ linux-2.6/fs/jfs/jfs_txnmgr.c @@ -1279,7 +1279,9 @@ int txCommit(tid_t tid, /* transaction * lazy commit thread finishes processing */ if (tblk->xflag & COMMIT_DELETE) { - atomic_inc(&tblk->u.ip->i_count); + spin_lock(&tblk->u.ip->i_lock); + tblk->u.ip->i_count++; + spin_unlock(&tblk->u.ip->i_lock); /* * Avoid a rare deadlock * Index: linux-2.6/fs/jfs/namei.c =================================================================== --- linux-2.6.orig/fs/jfs/namei.c +++ linux-2.6/fs/jfs/namei.c @@ -831,7 +831,9 @@ static int jfs_link(struct dentry *old_d ip->i_ctime = CURRENT_TIME; dir->i_ctime = dir->i_mtime = CURRENT_TIME; mark_inode_dirty(dir); - atomic_inc(&ip->i_count); + spin_lock(&ip->i_lock); + ip->i_count++; + spin_unlock(&ip->i_lock); iplist[0] = ip; iplist[1] = dir; Index: linux-2.6/fs/minix/namei.c =================================================================== --- linux-2.6.orig/fs/minix/namei.c +++ linux-2.6/fs/minix/namei.c @@ -103,7 +103,9 @@ static int minix_link(struct dentry * ol inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); return add_nondir(dentry, inode); } Index: linux-2.6/fs/nfs/inode.c =================================================================== --- linux-2.6.orig/fs/nfs/inode.c +++ linux-2.6/fs/nfs/inode.c @@ -396,7 +396,7 @@ nfs_fhget(struct super_block *sb, struct dprintk("NFS: nfs_fhget(%s/%Ld ct=%d)\n", 
inode->i_sb->s_id, (long long)NFS_FILEID(inode), - atomic_read(&inode->i_count)); + inode->i_count); out: return inode; @@ -1153,7 +1153,7 @@ static int nfs_update_inode(struct inode dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, - atomic_read(&inode->i_count), fattr->valid); + inode->i_count, fattr->valid); if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) goto out_fileid; @@ -1393,11 +1393,18 @@ struct inode *nfs_alloc_inode(struct sup return &nfsi->vfs_inode; } -void nfs_destroy_inode(struct inode *inode) +static void nfs_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } +void nfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, nfs_i_callback); +} + static inline void nfs4_init_once(struct nfs_inode *nfsi) { #ifdef CONFIG_NFS_V4 Index: linux-2.6/fs/nilfs2/mdt.c =================================================================== --- linux-2.6.orig/fs/nilfs2/mdt.c +++ linux-2.6/fs/nilfs2/mdt.c @@ -467,7 +467,7 @@ nilfs_mdt_new_common(struct the_nilfs *n inode->i_sb = sb; /* sb may be NULL for some meta data files */ inode->i_blkbits = nilfs->ns_blocksize_bits; inode->i_flags = 0; - atomic_set(&inode->i_count, 1); + inode->i_count = 1; inode->i_nlink = 1; inode->i_ino = ino; inode->i_mode = S_IFREG; Index: linux-2.6/fs/nilfs2/namei.c =================================================================== --- linux-2.6.orig/fs/nilfs2/namei.c +++ linux-2.6/fs/nilfs2/namei.c @@ -221,7 +221,9 @@ static int nilfs_link(struct dentry *old inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); err = nilfs_add_nondir(dentry, inode); if (!err) Index: linux-2.6/fs/ocfs2/namei.c 
=================================================================== --- linux-2.6.orig/fs/ocfs2/namei.c +++ linux-2.6/fs/ocfs2/namei.c @@ -719,7 +719,9 @@ static int ocfs2_link(struct dentry *old goto out_commit; } - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); Index: linux-2.6/fs/reiserfs/file.c =================================================================== --- linux-2.6.orig/fs/reiserfs/file.c +++ linux-2.6/fs/reiserfs/file.c @@ -39,7 +39,7 @@ static int reiserfs_file_release(struct BUG_ON(!S_ISREG(inode->i_mode)); /* fast out for when nothing needs to be done */ - if ((atomic_read(&inode->i_count) > 1 || + if ((inode->i_count > 1 || !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || !tail_has_to_be_packed(inode)) && REISERFS_I(inode)->i_prealloc_count <= 0) { @@ -94,7 +94,7 @@ static int reiserfs_file_release(struct if (!err) err = jbegin_failure; - if (!err && atomic_read(&inode->i_count) <= 1 && + if (!err && inode->i_count <= 1 && (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && tail_has_to_be_packed(inode)) { /* if regular file is released by last holder and it has been Index: linux-2.6/fs/reiserfs/namei.c =================================================================== --- linux-2.6.orig/fs/reiserfs/namei.c +++ linux-2.6/fs/reiserfs/namei.c @@ -1142,7 +1142,9 @@ static int reiserfs_link(struct dentry * inode->i_ctime = CURRENT_TIME_SEC; reiserfs_update_sd(&th, inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); d_instantiate(dentry, inode); retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_write_unlock(dir->i_sb); Index: linux-2.6/fs/reiserfs/stree.c =================================================================== --- linux-2.6.orig/fs/reiserfs/stree.c +++ linux-2.6/fs/reiserfs/stree.c @@ -1440,7 +1440,7 @@ static int 
maybe_indirect_to_direct(stru ** reading in the last block. The user will hit problems trying to ** read the file, but for now we just skip the indirect2direct */ - if (atomic_read(&inode->i_count) > 1 || + if (inode->i_count > 1 || !tail_has_to_be_packed(inode) || !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) { /* leave tail in an unformatted node */ Index: linux-2.6/fs/sysv/namei.c =================================================================== --- linux-2.6.orig/fs/sysv/namei.c +++ linux-2.6/fs/sysv/namei.c @@ -126,7 +126,9 @@ static int sysv_link(struct dentry * old inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); return add_nondir(dentry, inode); } Index: linux-2.6/fs/ubifs/dir.c =================================================================== --- linux-2.6.orig/fs/ubifs/dir.c +++ linux-2.6/fs/ubifs/dir.c @@ -557,7 +557,9 @@ static int ubifs_link(struct dentry *old lock_2_inodes(dir, inode); inc_nlink(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); inode->i_ctime = ubifs_current_time(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; Index: linux-2.6/fs/ubifs/super.c =================================================================== --- linux-2.6.orig/fs/ubifs/super.c +++ linux-2.6/fs/ubifs/super.c @@ -342,7 +342,7 @@ static void ubifs_delete_inode(struct in goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(!atomic_read(&inode->i_count)); + ubifs_assert(!inode->i_count); ubifs_assert(inode->i_nlink == 0); truncate_inode_pages(&inode->i_data, 0); Index: linux-2.6/fs/udf/namei.c =================================================================== --- linux-2.6.orig/fs/udf/namei.c +++ linux-2.6/fs/udf/namei.c @@ -1090,7 +1090,9 @@ static int udf_link(struct dentry *old_d inc_nlink(inode); 
inode->i_ctime = current_fs_time(inode->i_sb); mark_inode_dirty(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); d_instantiate(dentry, inode); unlock_kernel(); Index: linux-2.6/fs/ufs/namei.c =================================================================== --- linux-2.6.orig/fs/ufs/namei.c +++ linux-2.6/fs/ufs/namei.c @@ -178,7 +178,9 @@ static int ufs_link (struct dentry * old inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); - atomic_inc(&inode->i_count); + spin_lock(&inode->i_lock); + inode->i_count++; + spin_unlock(&inode->i_lock); error = ufs_add_nondir(dentry, inode); unlock_kernel(); Index: linux-2.6/fs/ntfs/super.c =================================================================== --- linux-2.6.orig/fs/ntfs/super.c +++ linux-2.6/fs/ntfs/super.c @@ -2921,7 +2921,9 @@ static int ntfs_fill_super(struct super_ } if ((sb->s_root = d_alloc_root(vol->root_ino))) { /* We increment i_count simulating an ntfs_iget(). */ - atomic_inc(&vol->root_ino->i_count); + spin_lock(&vol->root_ino->i_lock); + vol->root_ino->i_count++; + spin_unlock(&vol->root_ino->i_lock); ntfs_debug("Exiting, status successful."); /* Release the default upcase if it has no users. 
*/ mutex_lock(&ntfs_lock); Index: linux-2.6/fs/cifs/inode.c =================================================================== --- linux-2.6.orig/fs/cifs/inode.c +++ linux-2.6/fs/cifs/inode.c @@ -1428,7 +1428,7 @@ int cifs_revalidate(struct dentry *diren } cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " "jiffies %ld", full_path, direntry->d_inode, - direntry->d_inode->i_count.counter, direntry, + direntry->d_inode->i_count, direntry, direntry->d_time, jiffies)); if (cifsInode->time == 0) { Index: linux-2.6/mm/backing-dev.c =================================================================== --- linux-2.6.orig/mm/backing-dev.c +++ linux-2.6/mm/backing-dev.c @@ -71,7 +71,7 @@ static int bdi_debug_stats_show(struct s * RCU on the reader side */ nr_wb = nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_lock); + spin_lock(&wb_inode_list_lock); list_for_each_entry(wb, &bdi->wb_list, list) { nr_wb++; list_for_each_entry(inode, &wb->b_dirty, i_list) @@ -81,7 +81,7 @@ static int bdi_debug_stats_show(struct s list_for_each_entry(inode, &wb->b_more_io, i_list) nr_more_io++; } - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -675,11 +675,11 @@ void bdi_destroy(struct backing_dev_info if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + spin_lock(&wb_inode_list_lock); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&wb_inode_list_lock); } bdi_unregister(bdi); Index: linux-2.6/fs/hugetlbfs/inode.c =================================================================== --- linux-2.6.orig/fs/hugetlbfs/inode.c +++ linux-2.6/fs/hugetlbfs/inode.c @@ -378,11 +378,12 @@ static void hugetlbfs_delete_inode(struc clear_inode(inode); } -static void hugetlbfs_forget_inode(struct inode *inode) 
__releases(inode_lock) +static void hugetlbfs_forget_inode(struct inode *inode) { if (generic_detach_inode(inode)) { truncate_hugepages(inode, 0); clear_inode(inode); + /* XXX: why no wake_up_inode? */ destroy_inode(inode); } } @@ -665,11 +666,18 @@ static struct inode *hugetlbfs_alloc_ino return &p->vfs_inode; } +static void hugetlbfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); - kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); + call_rcu(&inode->i_rcu, hugetlbfs_i_callback); } static const struct address_space_operations hugetlbfs_aops = { Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -1152,7 +1152,7 @@ __getblk_slow(struct block_device *bdev, * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and the global inode_lock. + * and mapping->tree_lock. 
*/ void mark_buffer_dirty(struct buffer_head *bh) { Index: linux-2.6/fs/ext2/super.c =================================================================== --- linux-2.6.orig/fs/ext2/super.c +++ linux-2.6/fs/ext2/super.c @@ -157,11 +157,18 @@ static struct inode *ext2_alloc_inode(st return &ei->vfs_inode; } -static void ext2_destroy_inode(struct inode *inode) +static void ext2_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } +static void ext2_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ext2_i_callback); +} + static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; Index: linux-2.6/fs/ext3/super.c =================================================================== --- linux-2.6.orig/fs/ext3/super.c +++ linux-2.6/fs/ext3/super.c @@ -469,6 +469,13 @@ static struct inode *ext3_alloc_inode(st return &ei->vfs_inode; } +static void ext3_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); +} + static void ext3_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT3_I(inode)->i_orphan))) { @@ -479,7 +486,7 @@ static void ext3_destroy_inode(struct in false); dump_stack(); } - kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); + call_rcu(&inode->i_rcu, ext3_i_callback); } static void init_once(void *foo) Index: linux-2.6/fs/proc/inode.c =================================================================== --- linux-2.6.orig/fs/proc/inode.c +++ linux-2.6/fs/proc/inode.c @@ -88,11 +88,18 @@ static struct inode *proc_alloc_inode(st return inode; } -static void proc_destroy_inode(struct inode *inode) +static void proc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + 
INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; Index: linux-2.6/fs/fat/inode.c =================================================================== --- linux-2.6.orig/fs/fat/inode.c +++ linux-2.6/fs/fat/inode.c @@ -497,11 +497,18 @@ static struct inode *fat_alloc_inode(str return &ei->vfs_inode; } -static void fat_destroy_inode(struct inode *inode) +static void fat_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } +static void fat_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, fat_i_callback); +} + static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; Index: linux-2.6/net/sunrpc/rpc_pipe.c =================================================================== --- linux-2.6.orig/net/sunrpc/rpc_pipe.c +++ linux-2.6/net/sunrpc/rpc_pipe.c @@ -162,11 +162,19 @@ rpc_alloc_inode(struct super_block *sb) } static void -rpc_destroy_inode(struct inode *inode) +rpc_i_callback(struct rcu_head *head) { + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); } +static void +rpc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, rpc_i_callback); +} + static int rpc_pipe_open(struct inode *inode, struct file *filp) { Index: linux-2.6/include/linux/inotify.h =================================================================== --- linux-2.6.orig/include/linux/inotify.h +++ linux-2.6/include/linux/inotify.h @@ -111,7 +111,7 @@ extern void inotify_inode_queue_event(st const char *, struct inode *); extern void inotify_dentry_parent_queue_event(struct dentry *, 
__u32, __u32, const char *); -extern void inotify_unmount_inodes(struct list_head *); +extern void inotify_unmount_inodes(struct super_block *); extern void inotify_inode_is_dead(struct inode *); extern u32 inotify_get_cookie(void); @@ -161,7 +161,7 @@ static inline void inotify_dentry_parent { } -static inline void inotify_unmount_inodes(struct list_head *list) +static inline void inotify_unmount_inodes(struct super_block *sb) { }