From: Eric W. Biederman Every inode in /proc holds a reference to a struct task_struct. If a directory or file is opened and remains open after the task exits this pinning continues. With 8K stacks on a 32bit machine the amount pinned per file descriptor is about 10K. Normally I would figure a reasonable per user process limit is about 100 processes. With 80 processes, with 1000 file descriptors each I can trigger the OOM killer on a 32bit kernel, because I have pinned about 800MB of useless data. This patch replaces the struct task_struct pointer with a pointer to a struct task_ref which has a struct task_struct pointer. That way the pinning of dead tasks does not happen. The code now has to contend with the fact that the task may now exit at any time. This is a little, but not much, more complicated. With this change it takes about 1000 processes each opening up 1000 file descriptors before I can trigger the OOM killer. Much better. Signed-off-by: Eric W. Biederman Cc: Trond Myklebust Cc: Paul Jackson Cc: Oleg Nesterov Cc: Albert Cahalan Signed-off-by: Andrew Morton --- fs/proc/base.c | 355 +++++++++++++++++++++++++++----------- fs/proc/inode.c | 9 fs/proc/internal.h | 15 + fs/proc/task_mmu.c | 75 +++++--- include/linux/proc_fs.h | 8 kernel/cpuset.c | 27 ++ mm/mempolicy.c | 6 7 files changed, 350 insertions(+), 145 deletions(-) diff -puN fs/proc/base.c~proc-dont-lock-task_structs-indefinitely fs/proc/base.c --- devel/fs/proc/base.c~proc-dont-lock-task_structs-indefinitely 2006-05-19 16:01:36.000000000 -0700 +++ devel-akpm/fs/proc/base.c 2006-05-19 16:01:36.000000000 -0700 @@ -303,12 +303,15 @@ static struct pid_entry tid_attr_stuff[] static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct task_struct *task = proc_task(inode); - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); - files = 
get_files_struct(task); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } if (files) { /* * We are not taking a ref to the file structure, so we must @@ -340,10 +343,29 @@ static struct fs_struct *get_fs_struct(s return fs; } +static int get_nr_threads(struct task_struct *tsk) +{ + /* Must be called with the rcu_read_lock held */ + unsigned long flags; + int count = 0; + + if (lock_task_sighand(tsk, &flags)) { + count = atomic_read(&tsk->signal->count); + unlock_task_sighand(tsk, &flags); + } + return count; +} + static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->pwdmnt); @@ -357,8 +379,14 @@ static int proc_cwd_link(struct inode *i static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->rootmnt); @@ -546,16 +574,19 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); - struct namespace *namespace; + struct task_struct *task = get_proc_task(inode); + struct namespace *namespace = NULL; struct proc_mounts *p; int ret = -EINVAL; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) { 
ret = -ENOMEM; @@ -612,17 +643,21 @@ static struct file_operations proc_mount extern struct seq_operations mountstats_op; static int mountstats_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); int ret = seq_open(file, &mountstats_op); if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + struct namespace *namespace = NULL; + struct task_struct *task = get_proc_task(inode); + + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) m->private = namespace; @@ -649,18 +684,27 @@ static ssize_t proc_info_read(struct fil struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PROC_BLOCK_SIZE) count = PROC_BLOCK_SIZE; + + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = PROC_I(inode)->op.proc_read(task, (char*)page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -677,12 +721,15 @@ static int mem_open(struct inode* inode, static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) goto out; @@ -732,6 +779,8 @@ out_put: out_free: free_page((unsigned long) page); out: + 
put_task_struct(task); +out_no_task: return ret; } @@ -744,15 +793,20 @@ static ssize_t mem_write(struct file * f { int copied = 0; char *page; - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; + copied = -ESRCH; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) - return -ESRCH; + goto out; + copied = -ENOMEM; page = (char *)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; while (count > 0) { int this_len, retval; @@ -775,6 +829,9 @@ static ssize_t mem_write(struct file * f } *ppos = dst; free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return copied; } #endif @@ -805,12 +862,17 @@ static struct file_operations proc_mem_o static ssize_t oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust = task->oomkilladj; + int oom_adjust; loff_t __ppos = *ppos; + if (!task) + return -ESRCH; + oom_adjust = task->oomkilladj; + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); if (__ppos >= len) return 0; @@ -825,7 +887,7 @@ static ssize_t oom_adjust_read(struct fi static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task; char buffer[PROC_NUMBUF], *end; int oom_adjust; @@ -841,7 +903,11 @@ static ssize_t oom_adjust_write(struct f return -EINVAL; if (*end == '\n') end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; task->oomkilladj = oom_adjust; + put_task_struct(task); if (end - buffer == 0) return -EIO; return end - buffer; @@ -858,12 +924,15 @@ static ssize_t 
proc_loginuid_read(struct size_t count, loff_t *ppos) { struct inode * inode = file->f_dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; + if (!task) + return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_loginuid(task->audit_context)); + put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -873,13 +942,12 @@ static ssize_t proc_loginuid_write(struc struct inode * inode = file->f_dentry->d_inode; char *page, *tmp; ssize_t length; - struct task_struct *task = proc_task(inode); uid_t loginuid; if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != task) + if (current != proc_tref(inode)->task) return -EPERM; if (count > PAGE_SIZE) @@ -902,7 +970,7 @@ static ssize_t proc_loginuid_write(struc goto out_free_page; } - length = audit_set_loginuid(task, loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; @@ -921,13 +989,16 @@ static struct file_operations proc_login static ssize_t seccomp_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20]; loff_t __ppos = *ppos; size_t len; + if (!tsk) + return -ESRCH; /* no need to print the trailing zero, so use only len */ len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + put_task_struct(tsk); if (__ppos >= len) return 0; if (count > len - __ppos) @@ -941,29 +1012,43 @@ static ssize_t seccomp_read(struct file static ssize_t seccomp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20], *end; unsigned int seccomp_mode; + ssize_t result; + + result = -ESRCH; + if (!tsk) + goto 
out_no_task; /* can set it only once to be even more secure */ + result = -EPERM; if (unlikely(tsk->seccomp.mode)) - return -EPERM; + goto out; + result = -EFAULT; memset(__buf, 0, sizeof(__buf)); count = min(count, sizeof(__buf) - 1); if (copy_from_user(__buf, buf, count)) - return -EFAULT; + goto out; + seccomp_mode = simple_strtoul(__buf, &end, 0); if (*end == '\n') end++; + result = -EINVAL; if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { tsk->seccomp.mode = seccomp_mode; set_tsk_thread_flag(tsk, TIF_SECCOMP); } else - return -EINVAL; + goto out; + result = -EIO; if (unlikely(!(end - __buf))) - return -EIO; - return end - __buf; + goto out; + result = end - __buf; +out: + put_task_struct(tsk); +out_no_task: + return result; } static struct file_operations proc_seccomp_operations = { @@ -990,7 +1075,7 @@ static int proc_check_dentry_visible(str /* See if the the two tasks share a commone set of * file descriptors. If so everything is visible. */ - task = proc_task(inode); + task = get_proc_task(inode); if (!task) goto out; files = get_files_struct(current); @@ -1001,6 +1086,7 @@ static int proc_check_dentry_visible(str put_files_struct(task_files); if (files) put_files_struct(files); + put_task_struct(task); if (!error) goto out; @@ -1101,7 +1187,7 @@ static int proc_readfd(struct file * fil { struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; int retval; char buf[PROC_NUMBUF]; @@ -1109,8 +1195,8 @@ static int proc_readfd(struct file * fil struct fdtable *fdt; retval = -ENOENT; - if (!pid_alive(p)) - goto out; + if (!p) + goto out_no_task; retval = 0; tid = p->pid; @@ -1159,6 +1245,8 @@ static int proc_readfd(struct file * fil put_files_struct(files); } out: + put_task_struct(p); +out_no_task: return retval; } @@ -1170,16 +1258,18 @@ static int proc_pident_readdir(struct fi int pid; struct dentry *dentry = 
filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); struct pid_entry *p; ino_t ino; int ret; ret = -ENOENT; - if (!pid_alive(proc_task(inode))) + if (!task) goto out; ret = 0; - pid = proc_task(inode)->pid; + pid = task->pid; + put_task_struct(task); i = filp->f_pos; switch (i) { case 0: @@ -1265,14 +1355,13 @@ static struct inode *proc_pid_make_inode inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) - goto out_unlock; - /* * grab the reference to task. */ - get_task_struct(task); - ei->task = task; + ei->tref = tref_get_by_task(task); + if (!tref_task(ei->tref)) + goto out_unlock; + inode->i_uid = 0; inode->i_gid = 0; if (task_dumpable(task)) { @@ -1298,13 +1387,21 @@ out_unlock: * * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. 
*/ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - if (pid_alive(task)) { - if (task_dumpable(task)) { + struct task_struct *task = get_proc_task(inode); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } else { @@ -1312,37 +1409,63 @@ static int pid_revalidate(struct dentry inode->i_gid = 0; } security_task_to_inode(task, inode); + put_task_struct(task); return 1; } d_drop(dentry); return 0; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); int fd = proc_fd(inode); struct files_struct *files; - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - if (fcheck_files(files, fd)) { + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; - } else { - 
inode->i_uid = 0; - inode->i_gid = 0; - } - security_task_to_inode(task, inode); - return 1; } - rcu_read_unlock(); - put_files_struct(files); + put_task_struct(task); } d_drop(dentry); return 0; @@ -1354,7 +1477,7 @@ static int pid_delete_dentry(struct dent * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !pid_alive(proc_task(dentry->d_inode)); + return !proc_tref(dentry->d_inode)->task; } static struct dentry_operations tid_fd_dentry_operations = @@ -1396,7 +1519,7 @@ out: /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); unsigned fd = name_to_int(dentry); struct dentry *result = ERR_PTR(-ENOENT); struct file * file; @@ -1404,10 +1527,10 @@ static struct dentry *proc_lookupfd(stru struct inode *inode; struct proc_inode *ei; + if (!task) + goto out_no_task; if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); if (!inode) @@ -1442,6 +1565,8 @@ static struct dentry *proc_lookupfd(stru if (tid_fd_revalidate(dentry, NULL)) result = NULL; out: + put_task_struct(task); +out_no_task: return result; out_unlock2: @@ -1485,12 +1610,17 @@ static ssize_t proc_pid_attr_read(struct struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = security_getprocattr(task, (char*)file->f_dentry->d_name.name, @@ -1498,6 +1628,9 @@ static ssize_t proc_pid_attr_read(struct if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + 
put_task_struct(task); +out_no_task: return length; } @@ -1507,26 +1640,36 @@ static ssize_t proc_pid_attr_write(struc struct inode * inode = file->f_dentry->d_inode; char *page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; - if (*ppos != 0) { - /* No partial writes. */ - return -EINVAL; - } + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; page = (char*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; + length = -EFAULT; if (copy_from_user(page, buf, count)) - goto out; + goto out_free; length = security_setprocattr(task, (char*)file->f_dentry->d_name.name, (void*)page, count); -out: +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1548,15 +1691,15 @@ static struct dentry *proc_pident_lookup { struct inode *inode; struct dentry *error; - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); struct pid_entry *p; struct proc_inode *ei; error = ERR_PTR(-ENOENT); inode = NULL; - if (!pid_alive(task)) - goto out; + if (!task) + goto out_no_task; for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) @@ -1741,6 +1884,8 @@ static struct dentry *proc_pident_lookup if (pid_revalidate(dentry, NULL)) error = NULL; out: + put_task_struct(task); +out_no_task: return error; } @@ -1764,10 +1909,12 @@ static struct file_operations proc_tid_b static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, }; #ifdef CONFIG_SECURITY @@ -1809,10 +1956,12 @@ static struct dentry *proc_tid_attr_look static struct inode_operations proc_tgid_attr_inode_operations = { 
.lookup = proc_tgid_attr_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, + .getattr = pid_getattr, }; #endif @@ -1974,10 +2123,13 @@ static struct dentry *proc_task_lookup(s { struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = proc_task(dir); + struct task_struct *leader = get_proc_task(dir); struct inode *inode; unsigned tid; + if (!leader) + goto out_no_task; + tid = name_to_int(dentry); if (tid == ~0U) goto out; @@ -2017,6 +2169,8 @@ static struct dentry *proc_task_lookup(s out_drop_task: put_task_struct(task); out: + put_task_struct(leader); +out_no_task: return result; } @@ -2156,12 +2310,7 @@ static struct task_struct *first_tid(str /* If nr exceeds the number of threads there is nothing todo */ if (nr) { - int threads = 0; - task_lock(leader); - if (leader->signal) - threads = atomic_read(&leader->signal->count); - task_unlock(leader); - if (nr >= threads) + if (nr >= get_nr_threads(leader)) goto done; } @@ -2211,15 +2360,15 @@ static int proc_task_readdir(struct file char buf[PROC_NUMBUF]; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; - struct task_struct *leader = proc_task(inode); + struct task_struct *leader = get_proc_task(inode); struct task_struct *task; int retval = -ENOENT; ino_t ino; int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ - if (!pid_alive(leader)) - goto out; + if (!leader) + goto out_no_task; retval = 0; switch (pos) { @@ -2259,20 +2408,22 @@ static int proc_task_readdir(struct file } out: filp->f_pos = pos; + put_task_struct(leader); +out_no_task: return retval; } static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct task_struct *p = get_proc_task(inode); generic_fillattr(inode, stat); - if (pid_alive(p)) { - 
task_lock(p); - if (p->signal) - stat->nlink += atomic_read(&p->signal->count); - task_unlock(p); + if (p) { + rcu_read_lock(); + stat->nlink += get_nr_threads(p); + rcu_read_unlock(); + put_task_struct(p); } return 0; diff -puN fs/proc/inode.c~proc-dont-lock-task_structs-indefinitely fs/proc/inode.c --- devel/fs/proc/inode.c~proc-dont-lock-task_structs-indefinitely 2006-05-19 16:01:36.000000000 -0700 +++ devel-akpm/fs/proc/inode.c 2006-05-19 16:01:36.000000000 -0700 @@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; - struct task_struct *tsk; truncate_inode_pages(&inode->i_data, 0); - /* Let go of any associated process */ - tsk = PROC_I(inode)->task; - if (tsk) - put_task_struct(tsk); + /* Stop tracking associated processes */ + tref_put(PROC_I(inode)->tref); /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; @@ -94,7 +91,7 @@ static struct inode *proc_alloc_inode(st ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; - ei->task = NULL; + ei->tref = NULL; ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; diff -puN fs/proc/internal.h~proc-dont-lock-task_structs-indefinitely fs/proc/internal.h --- devel/fs/proc/internal.h~proc-dont-lock-task_structs-indefinitely 2006-05-19 16:01:36.000000000 -0700 +++ devel-akpm/fs/proc/internal.h 2006-05-19 16:01:36.000000000 -0700 @@ -10,6 +10,7 @@ */ #include +#include struct vmalloc_info { unsigned long used; @@ -41,13 +42,23 @@ extern struct file_operations proc_maps_ extern struct file_operations proc_numa_maps_operations; extern struct file_operations proc_smaps_operations; +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + + void free_proc_entry(struct proc_dir_entry *de); int proc_init_inodecache(void); -static inline struct task_struct 
*proc_task(struct inode *inode) +static inline struct task_ref *proc_tref(struct inode *inode) +{ + return PROC_I(inode)->tref; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) { - return PROC_I(inode)->task; + return get_tref_task(proc_tref(inode)); } static inline int proc_fd(struct inode *inode) diff -puN fs/proc/task_mmu.c~proc-dont-lock-task_structs-indefinitely fs/proc/task_mmu.c --- devel/fs/proc/task_mmu.c~proc-dont-lock-task_structs-indefinitely 2006-05-19 16:01:36.000000000 -0700 +++ devel-akpm/fs/proc/task_mmu.c 2006-05-19 16:01:36.000000000 -0700 @@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, s { struct vm_area_struct * vma; int result = -ENOENT; - struct task_struct *task = proc_task(inode); - struct mm_struct * mm = get_task_mm(task); + struct task_struct *task = get_proc_task(inode); + struct mm_struct * mm = NULL; + if (task) { + mm = get_task_mm(task); + put_task_struct(task); + } if (!mm) goto out; down_read(&mm->mmap_sem); @@ -301,12 +305,16 @@ static int show_smap(struct seq_file *m, static void *m_start(struct seq_file *m, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma; + struct vm_area_struct *vma; loff_t l = *pos; + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + /* * We remember last_addr rather than next_addr to hit with * mmap_cache most of the time. 
We have zero last_addr at @@ -317,11 +325,15 @@ static void *m_start(struct seq_file *m, if (last_addr == -1UL) return NULL; - mm = get_task_mm(task); + priv->task = get_tref_task(priv->tref); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); if (!mm) return NULL; - tail_vma = get_gate_vma(task); + priv->tail_vma = get_gate_vma(priv->task); down_read(&mm->mmap_sem); /* Start with last addr hint */ @@ -343,24 +355,22 @@ static void *m_start(struct seq_file *m, } if (l != mm->map_count) - tail_vma = NULL; /* After gate vma */ + priv->tail_vma = NULL; /* After gate vma */ out: if (vma) return vma; /* End of vmas has been reached */ - m->version = (tail_vma != NULL)? 0: -1UL; + m->version = (priv->tail_vma != NULL)? 0: -1UL; up_read(&mm->mmap_sem); mmput(mm); - return tail_vma; + return priv->tail_vma; } -static void m_stop(struct seq_file *m, void *v) +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - if (vma && vma != get_gate_vma(task)) { + if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; up_read(&mm->mmap_sem); mmput(mm); @@ -369,17 +379,27 @@ static void m_stop(struct seq_file *m, v static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = get_gate_vma(task); + struct vm_area_struct *tail_vma = priv->tail_vma; (*pos)++; if (vma && (vma != tail_vma) && vma->vm_next) return vma->vm_next; - m_stop(m, v); + vma_stop(priv, vma); return (vma != tail_vma)? 
tail_vma: NULL; } +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, @@ -397,11 +417,18 @@ static struct seq_operations proc_pid_sm static int do_maps_open(struct inode *inode, struct file *file, struct seq_operations *ops) { - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->tref = proc_tref(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } } return ret; } @@ -415,7 +442,7 @@ struct file_operations proc_maps_operati .open = maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #ifdef CONFIG_NUMA @@ -437,7 +464,7 @@ struct file_operations proc_numa_maps_op .open = numa_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #endif @@ -450,5 +477,5 @@ struct file_operations proc_smaps_operat .open = smaps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; diff -puN include/linux/proc_fs.h~proc-dont-lock-task_structs-indefinitely include/linux/proc_fs.h --- devel/include/linux/proc_fs.h~proc-dont-lock-task_structs-indefinitely 2006-05-19 16:01:36.000000000 -0700 +++ devel-akpm/include/linux/proc_fs.h 2006-05-19 16:01:36.000000000 -0700 @@ -246,7 +246,7 @@ extern void kclist_add(struct kcore_list #endif struct proc_inode { - struct task_struct *task; + struct task_ref *tref; int fd; union { int (*proc_get_link)(struct inode *, struct dentry 
**, struct vfsmount **); @@ -266,4 +266,10 @@ static inline struct proc_dir_entry *PDE return PROC_I(inode)->pde; } +struct proc_maps_private { + struct task_ref *tref; + struct task_struct *task; + struct vm_area_struct *tail_vma; +}; + #endif /* _LINUX_PROC_FS_H */ diff -puN mm/mempolicy.c~proc-dont-lock-task_structs-indefinitely mm/mempolicy.c --- devel/mm/mempolicy.c~proc-dont-lock-task_structs-indefinitely 2006-05-19 16:01:36.000000000 -0700 +++ devel-akpm/mm/mempolicy.c 2006-05-19 16:01:36.000000000 -0700 @@ -1816,7 +1816,7 @@ static inline void check_huge_range(stru int show_numa_map(struct seq_file *m, void *v) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; struct file *file = vma->vm_file; @@ -1832,7 +1832,7 @@ int show_numa_map(struct seq_file *m, vo return 0; mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + get_vma_policy(priv->task, vma, vma->vm_start)); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1886,7 +1886,7 @@ out: kfree(md); if (m->count < m->size) - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != priv->tail_vma) ? 
vma->vm_start : 0; return 0; } diff -puN kernel/cpuset.c~proc-dont-lock-task_structs-indefinitely kernel/cpuset.c --- devel/kernel/cpuset.c~proc-dont-lock-task_structs-indefinitely 2006-05-19 16:01:36.000000000 -0700 +++ devel-akpm/kernel/cpuset.c 2006-05-19 16:01:36.000000000 -0700 @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -2434,31 +2435,43 @@ void __cpuset_memory_pressure_bump(void) */ static int proc_cpuset_show(struct seq_file *m, void *v) { + struct task_ref *tref; struct task_struct *tsk; char *buf; - int retval = 0; + int retval; + retval = -ENOMEM; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) - return -ENOMEM; + goto out; + + retval = -ESRCH; + tref = m->private; + tsk = get_tref_task(tref); + if (!tsk) + goto out_free; - tsk = m->private; + retval = -EINVAL; mutex_lock(&manage_mutex); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) - goto out; + goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); -out: +out_unlock: mutex_unlock(&manage_mutex); + put_task_struct(tsk); +out_free: kfree(buf); +out: return retval; } static int cpuset_open(struct inode *inode, struct file *file) { - struct task_struct *tsk = PROC_I(inode)->task; - return single_open(file, proc_cpuset_show, tsk); + struct task_ref *tref = PROC_I(inode)->tref; + return single_open(file, proc_cpuset_show, tref); } struct file_operations proc_cpuset_operations = { _