Subject: [PATCH] First working version of private process spaces. From: Eric W. Biederman Date: 1128938348 -0600 More todo but this roughly works. --- arch/i386/kernel/process.c | 7 +++-- fs/proc/base.c | 60 +++++++++++++++++++++++++++++++++++++++----- fs/proc/inode.c | 2 + fs/proc/internal.h | 1 + fs/proc/root.c | 43 ++++++++++++++++++++++++++++++++ include/linux/pspace.h | 13 ++++------ kernel/fork.c | 28 ++++++++++++++------- kernel/pid.c | 11 ++++---- kernel/sys.c | 18 +++++-------- kernel/timer.c | 4 +-- 10 files changed, 142 insertions(+), 45 deletions(-) 79d3dc45a5849d28fef2fca302c7dddfb4890cda diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 7a14fdf..7f6cbc4 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -738,7 +739,7 @@ struct task_struct fastcall * __switch_t asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return pid_to_user(do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL)); } asmlinkage int sys_clone(struct pt_regs regs) @@ -753,7 +754,7 @@ asmlinkage int sys_clone(struct pt_regs child_tidptr = (int __user *)regs.edi; if (!newsp) newsp = regs.esp; - return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); + return pid_to_user(do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr)); } /* @@ -768,7 +769,7 @@ asmlinkage int sys_clone(struct pt_regs */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return pid_to_user(do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL)); } /* diff --git a/fs/proc/base.c b/fs/proc/base.c index d8d1f10..6b1062d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1377,6 +1377,28 @@ static int pid_delete_dentry(struct dent return !pid_alive(proc_task(dentry->d_inode)); } +static int pid_task_hash(struct dentry *dentry, struct
qstr *qstr) +{ + int tgid = proc_name_to_int(qstr); + /* Just use the pid as the hash value */ + tgid = pid_from_user(tgid); + qstr->hash = tgid; + return 0; +} + +static int pid_task_cmp(struct dentry *dentry, struct qstr *cached, struct qstr *user) +{ + int cached_tgid, user_tgid; + cached_tgid = proc_name_to_int(cached); + user_tgid = proc_name_to_int(user); + if (cached_tgid == user_tgid) { + /* 2 equal pids */ + return 0; + } + return 1; +} + + static struct dentry_operations tid_fd_dentry_operations = { .d_revalidate = tid_fd_revalidate, @@ -1389,6 +1411,14 @@ static struct dentry_operations pid_dent .d_delete = pid_delete_dentry, }; +static struct dentry_operations pid_task_dentry_operations = +{ + .d_hash = pid_task_hash, + .d_compare = pid_task_cmp, + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + static struct dentry_operations pid_base_dentry_operations = { .d_revalidate = pid_revalidate, @@ -1398,10 +1428,10 @@ static struct dentry_operations pid_base /* Lookups */ -static unsigned name_to_int(struct dentry *dentry) +unsigned proc_name_to_int(struct qstr *qstr) { - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; + const char *name = qstr->name; + int len = qstr->len; unsigned n = 0; if (len > 1 && *name == '0') @@ -1424,7 +1454,7 @@ out: static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { struct task_struct *task = proc_task(dir); - unsigned fd = name_to_int(dentry); + unsigned fd = proc_name_to_int(&dentry->d_name); struct file * file; struct files_struct * files; struct inode *inode; @@ -1593,6 +1623,7 @@ static struct dentry *proc_pident_lookup ei = PROC_I(inode); inode->i_mode = p->mode; + dentry->d_op = &pid_dentry_operations; /* * Yes, it does not scale. And it should not. Don't add * new entries into /proc// without very good reasons. 
@@ -1602,6 +1633,7 @@ static struct dentry *proc_pident_lookup inode->i_nlink = 2 + get_tid_list(2, NULL, dir); inode->i_op = &proc_task_inode_operations; inode->i_fop = &proc_task_operations; + dentry->d_op = &pid_task_dentry_operations; break; case PROC_TID_FD: case PROC_TGID_FD: @@ -1748,7 +1780,6 @@ static struct dentry *proc_pident_lookup iput(inode); return ERR_PTR(-EINVAL); } - dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); return NULL; @@ -1929,9 +1960,17 @@ struct dentry *proc_pid_lookup(struct in d_add(dentry, inode); return NULL; } - tgid = name_to_int(dentry); + tgid = proc_name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; + tgid = pid_from_user(tgid); + + /* Normalize the dentry name */ + if (dentry->d_name.name != dentry->d_iname) + kfree(dentry->d_name.name); + dentry->d_name.name = dentry->d_iname; + dentry->d_name.len = snprintf(dentry->d_iname, sizeof(dentry->d_iname), + "%d", tgid); read_lock(&tasklist_lock); task = find_task_by_pid(tgid); @@ -1988,9 +2027,16 @@ static struct dentry *proc_task_lookup(s struct inode *inode; unsigned tid; - tid = name_to_int(dentry); + tid = proc_name_to_int(&dentry->d_name); if (tid == ~0U) goto out; + tid = pid_from_user(tid); + /* Normalize the dentry name */ + if (dentry->d_name.name != dentry->d_iname) + kfree(dentry->d_name.name); + dentry->d_name.name = dentry->d_iname; + dentry->d_name.len = snprintf(dentry->d_iname, sizeof(dentry->d_iname), + "%d", tid); read_lock(&tasklist_lock); task = find_task_by_pid(tid); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index effa6c0..8d2f330 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -20,6 +20,7 @@ #include extern void free_proc_entry(struct proc_dir_entry *); +extern struct dentry_operations proc_root_dops; static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) { @@ -210,6 +211,7 @@ int proc_fill_super(struct super_block * s->s_root = d_alloc_root(root_inode); if (!s->s_root) goto out_no_root; + s->s_root->d_op = 
&proc_root_dops; return 0; out_no_root: diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e55198..4d582ae 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -36,6 +36,7 @@ extern int proc_tid_stat(struct task_str extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern unsigned proc_name_to_int(struct qstr *qstr); static inline struct task_struct *proc_task(struct inode *inode) { diff --git a/fs/proc/root.c b/fs/proc/root.c index aef148f..ed26e10 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include "internal.h" struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; @@ -79,6 +81,47 @@ void __init proc_root_init(void) proc_bus = proc_mkdir("bus", NULL); } +static int proc_root_hash(struct dentry *dentry, struct qstr *qstr) +{ + int tgid = proc_name_to_int(qstr); + if (tgid == ~0) { + return 0; /* Use the default hash */ + } + /* Just use the pid as the hash value */ + tgid = pid_from_user(tgid); + qstr->hash = tgid; + return 0; +} + +static int proc_root_cmp(struct dentry *dentry, struct qstr *cached, struct qstr *user) +{ + int cached_tgid, user_tgid; + cached_tgid = proc_name_to_int(cached); + user_tgid = proc_name_to_int(user); + if (user_tgid != ~0) { + user_tgid = pid_from_user(user_tgid); + } + if (cached_tgid == user_tgid) { + if (cached_tgid == ~0) { + /* 2 non pids see if their names are equal */ + if ((cached->len == user->len) && + (memcmp(cached->name, user->name, cached->len) == 0)) + { + return 0; + } + } else { + /* 2 equal pids */ + return 0; + } + } + return 1; +} + +struct dentry_operations proc_root_dops = { + .d_hash = proc_root_hash, + .d_compare = proc_root_cmp, +}; + static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { /* diff --git 
a/include/linux/pspace.h b/include/linux/pspace.h index 3735906..2a17045 100644 --- a/include/linux/pspace.h +++ b/include/linux/pspace.h @@ -27,11 +27,6 @@ extern struct pspace init_pspace; #define INVALID_PID 0x7fffffff -static inline int pid_visible(struct task_struct *p) -{ - return (p->pid > current->pspace->offset) && (p->pid < current->pspace->max); -} - static inline int pid_from_user(int pid) { if (pid < current->pspace->max) @@ -43,10 +38,14 @@ static inline int pid_from_user(int pid) static inline int pid_to_user(int pid) { - pid -= current->pspace->offset; - return pid; + return pid - current->pspace->offset; } +static inline int pid_visible(struct task_struct *p) +{ + int pid = pid_to_user(p->pid); + return (pid > 0) && (pid < current->pspace->max); +} static inline void get_pspace(struct pspace *pspace) { diff --git a/kernel/fork.c b/kernel/fork.c index 5ec88d3..23dd98a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -946,10 +946,13 @@ static task_t *copy_process(unsigned lon p->did_exec = 0; copy_flags(clone_flags, p); p->pid = pid; + if ((retval = copy_pspace(clone_flags, p))) + goto bad_fork_cleanup; + retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(pid_to_user(p->pid), parent_tidptr)) - goto bad_fork_cleanup; + goto bad_fork_cleanup_pspace; p->proc_dentry = NULL; @@ -989,7 +992,7 @@ static task_t *copy_process(unsigned lon if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup; + goto bad_fork_cleanup_pspace; } #endif @@ -1135,13 +1138,6 @@ static task_t *copy_process(unsigned lon spin_unlock(&current->sighand->siglock); } - /* It is important that we don't have an error - * handling path after this or the original - * pid will be freed twice.
- */ - if ((retval = copy_pspace(clone_flags, p))) - goto bad_fork_cleanup_namespace; - /* * inherit ioprio */ @@ -1153,6 +1149,17 @@ static task_t *copy_process(unsigned lon cpuset_fork(p); + if (clone_flags & CLONE_NEWPSPACE) { + /* Free the pid in the wrong process space */ + free_pidmap(current->pspace, pid, 1); + pid = 0; + /* Become the process group and session leader */ + p->signal->leader = 1; /* FIXME Is this correct? */ + p->signal->pgrp = p->pid; + p->signal->session = p->pid; + p->signal->tty = NULL; + p->signal->tty_old_pgrp = 0; + } attach_pid(p, PIDTYPE_PID, p->pid); attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { @@ -1200,6 +1207,8 @@ bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_free(p->mempolicy); #endif +bad_fork_cleanup_pspace: + exit_pspace(p); bad_fork_cleanup: if (p->binfmt) module_put(p->binfmt->module); @@ -1284,6 +1293,7 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; + pid = p->pid; if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); diff --git a/kernel/pid.c b/kernel/pid.c index 70feecd..402cf38 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -317,21 +317,20 @@ int copy_pspace(int flags, struct task_s } /* Allocate the new pidspace structure */ - new = new_pspace(pids, pid); + new = new_pspace(pids, pid - 1); if (!new) { free_pidmap(p->pspace, pid, pids); put_pspace(p->pspace); return -ENOMEM; } - /* Free the orignal pid */ - free_pidmap(p->pspace, p->pid, 1); - + /* Allocate the new pid */ + pid = alloc_pidmap(new, 1); + /* Setup the new pspace and pid */ new->parent = p->pspace; p->pspace = new; p->pid = pid; - p->tgid = pid; return 0; } @@ -350,7 +349,7 @@ void __put_pspace(struct pspace *pspace) pids = pspace->max; pages = (pids + 8*PAGE_SIZE - 1)/PAGE_SIZE/8; for (i = 0; i < pages; i++) { - BUG_ON(&atomic_read(map[i].nr_free) != BITS_PER_PAGE); + BUG_ON(atomic_read(&map[i].nr_free) != BITS_PER_PAGE); free_page((unsigned long)map[i].page); } 
kfree(pspace); diff --git a/kernel/sys.c b/kernel/sys.c index 35524b5..5f4a13b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1074,17 +1074,12 @@ asmlinkage long sys_setpgid(pid_t pid, p struct task_struct *p; int err = -EINVAL; - if (!pid) - pid = current->pid; - if (!pgid) - pgid = pid; - if (pgid < 0) - return -EINVAL; if (pid < 0) return -EINVAL; - pid = pid_from_user(pid); - pgid = pid_from_user(pgid); - + if (pgid < 0) + return -EINVAL; + pid = (!pid)? current->pid : pid_from_user(pid); + pgid = (!pgid)? pid : pid_from_user(pgid); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM @@ -1174,7 +1169,7 @@ asmlinkage long sys_getpgid(pid_t pid) asmlinkage long sys_getpgrp(void) { /* SMP - assuming writes are word atomic this is fine */ - return process_group(current); + return pid_to_user(process_group(current)); } #endif @@ -1182,7 +1177,7 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) { - return current->signal->session; + return pid_to_user(current->signal->session); } else { int retval; struct task_struct *p; @@ -1195,6 +1190,7 @@ asmlinkage long sys_getsid(pid_t pid) retval = security_task_getsid(p); if (!retval) retval = p->signal->session; + retval = pid_to_user(retval); } read_unlock(&tasklist_lock); return retval; diff --git a/kernel/timer.c b/kernel/timer.c index cc1378b..efd2c03 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1024,7 +1024,7 @@ asmlinkage long sys_getppid(void) parent = me->group_leader->real_parent; for (;;) { - pid = parent->tgid; + pid = pid_visible(parent)? pid_to_user(parent->tgid) : 0; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) { struct task_struct *old = parent; @@ -1041,7 +1041,7 @@ asmlinkage long sys_getppid(void) #endif break; } - return pid_to_user(pid); + return pid; } asmlinkage long sys_getuid(void) -- 1.0.GIT