diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/arch/s390/appldata/appldata_base.c linux-2.6.14-ck4/arch/s390/appldata/appldata_base.c
--- linux-2.6.14-ck3/arch/s390/appldata/appldata_base.c	2005-08-29 13:31:20.000000000 +1000
+++ linux-2.6.14-ck4/arch/s390/appldata/appldata_base.c	2005-11-12 12:23:13.000000000 +1100
@@ -592,12 +592,15 @@ int appldata_register_ops(struct appldat
  */
 void appldata_unregister_ops(struct appldata_ops *ops)
 {
+	void *table;
 	spin_lock(&appldata_ops_lock);
-	unregister_sysctl_table(ops->sysctl_header);
 	list_del(&ops->list);
-	kfree(ops->ctl_table);
+	/* at that point any incoming access will fail */
+	table = ops->ctl_table;
 	ops->ctl_table = NULL;
 	spin_unlock(&appldata_ops_lock);
+	unregister_sysctl_table(ops->sysctl_header);
+	kfree(table);
 	P_INFO("%s-ops unregistered!\n", ops->name);
 }
 /********************** module-ops management **************************/
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/fs/proc/array.c linux-2.6.14-ck4/fs/proc/array.c
--- linux-2.6.14-ck3/fs/proc/array.c	2005-11-12 12:23:04.000000000 +1100
+++ linux-2.6.14-ck4/fs/proc/array.c	2005-11-12 12:23:13.000000000 +1100
@@ -165,7 +165,7 @@ static inline char * task_state(struct t
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
-		"Burst:\t%d\n"
+		"Bonus:\t%d\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -173,7 +173,7 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		p->burst,
+		p->bonus,
 		p->tgid,
 		p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
 		pid_alive(p) && p->ptrace ? p->parent->pid : 0,
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/include/linux/proc_fs.h linux-2.6.14-ck4/include/linux/proc_fs.h
--- linux-2.6.14-ck3/include/linux/proc_fs.h	2005-08-29 13:31:26.000000000 +1000
+++ linux-2.6.14-ck4/include/linux/proc_fs.h	2005-11-12 12:23:13.000000000 +1100
@@ -66,6 +66,7 @@ struct proc_dir_entry {
 	write_proc_t *write_proc;
 	atomic_t count;		/* use count */
 	int deleted;		/* delete flag */
+	void *set;
 };
 
 struct kcore_list {
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/include/linux/sched.h linux-2.6.14-ck4/include/linux/sched.h
--- linux-2.6.14-ck3/include/linux/sched.h	2005-11-12 12:23:04.000000000 +1100
+++ linux-2.6.14-ck4/include/linux/sched.h	2005-11-12 12:23:13.000000000 +1100
@@ -670,7 +670,7 @@ struct task_struct {
 
 	unsigned long long timestamp;
 	unsigned long runtime, totalrun, ns_debit;
-	unsigned int burst;
+	unsigned int bonus;
 	unsigned int slice, time_slice;
 	unsigned long long sched_time; /* sched_clock time spent running */
 
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/include/linux/sysctl.h linux-2.6.14-ck4/include/linux/sysctl.h
--- linux-2.6.14-ck3/include/linux/sysctl.h	2005-11-12 12:23:04.000000000 +1100
+++ linux-2.6.14-ck4/include/linux/sysctl.h	2005-11-12 12:23:13.000000000 +1100
@@ -24,6 +24,7 @@
 #include <linux/compiler.h>
 
 struct file;
+struct completion;
 
 #define CTL_MAXNAME 10		/* how many path components do we allow in a
 				   call to sysctl?   In other words, what is
@@ -930,6 +931,8 @@ struct ctl_table_header
 {
 	ctl_table *ctl_table;
 	struct list_head ctl_entry;
+	int used;
+	struct completion *unregistering;
 };
 
 struct ctl_table_header * register_sysctl_table(ctl_table * table, 
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/kernel/Kconfig.hz linux-2.6.14-ck4/kernel/Kconfig.hz
--- linux-2.6.14-ck3/kernel/Kconfig.hz	2005-11-12 12:23:04.000000000 +1100
+++ linux-2.6.14-ck4/kernel/Kconfig.hz	2005-11-12 12:23:13.000000000 +1100
@@ -21,8 +21,17 @@ choice
 	help
 	  100 HZ is a typical choice for servers, SMP and NUMA systems
 	  with lots of processors that may show reduced performance if
-	  too many timer interrupts are occurring. Laptops should have
-	  better battery life also.
+	  too many timer interrupts are occurring. Laptops may also show
+	  improved battery life.
+
+	config HZ_250_NODEFAULT
+		bool "250 HZ"
+	help
+	 250 HZ is a lousy compromise choice allowing server interactivity
+	 while also showing desktop throughput and no extra power saving on
+	 laptops. Good for when you can't make up your mind.
+
+	 Recommend 100 or 1000 instead.
 
 	config HZ_1000
 		bool "1000 HZ"
@@ -35,5 +44,6 @@ endchoice
 
 config HZ
 	int
 	default 100 if HZ_100
+	default 250 if HZ_250_NODEFAULT
 	default 1000 if HZ_1000
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/kernel/sched.c linux-2.6.14-ck4/kernel/sched.c
--- linux-2.6.14-ck3/kernel/sched.c	2005-11-12 12:23:04.000000000 +1100
+++ linux-2.6.14-ck4/kernel/sched.c	2005-11-12 12:23:14.000000000 +1100
@@ -16,9 +16,9 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
- *  2005-11-02	New staircase scheduling policy by Con Kolivas with help
+ *  2005-11-08	New staircase scheduling policy by Con Kolivas with help
  *		from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
- *  Staircase v12.2
+ *  Staircase v13
  */
 
 #include <linux/mm.h>
@@ -633,30 +633,11 @@ static inline void __activate_idle_task(
 }
 
 /*
- * burst - extra intervals an interactive task can run for at best priority
- * instead of descending priorities.
+ * Bonus - How much higher than its base priority an interactive task can run.
  */
-static inline unsigned int burst(task_t *p)
+static inline unsigned int bonus(task_t *p)
 {
-	if (likely(!rt_task(p))) {
-		unsigned int task_user_prio = TASK_USER_PRIO(p);
-		return 39 - task_user_prio;
-	} else
-		return p->burst;
-}
-
-static void inc_burst(task_t *p)
-{
-	unsigned int best_burst;
-	best_burst = burst(p);
-	if (p->burst < best_burst)
-		p->burst++;
-}
-
-static void dec_burst(task_t *p)
-{
-	if (p->burst)
-		p->burst--;
+	return TASK_USER_PRIO(p);
 }
 
 static inline unsigned int rr_interval(task_t * p)
@@ -671,33 +652,61 @@ static inline unsigned int rr_interval(t
 
 /*
  * slice - the duration a task runs before getting requeued at its best
- * priority and has its burst decremented.
+ * priority and has its bonus decremented.
  */
 static inline unsigned int slice(task_t *p)
 {
 	unsigned int slice, rr;
+
 	slice = rr = rr_interval(p);
 	if (likely(!rt_task(p)))
-		slice += burst(p) * rr;
+		slice += (39 - TASK_USER_PRIO(p)) * rr;
 	return slice;
 }
 
 /*
- * sched_interactive - sysctl which allows interactive tasks to have bursts
+ * We increase our bonus by sleeping more than the time we ran.
+ * The ratio of sleep to run gives us the cpu% that we last ran and determines
+ * the maximum bonus we can acquire.
+ */
+static void inc_bonus(task_t *p, unsigned long totalrun, unsigned long sleep)
+{
+	unsigned int best_bonus;
+
+	best_bonus = sleep / (totalrun + 1);
+	if (p->bonus >= best_bonus)
+		return;
+
+	p->bonus++;
+	best_bonus = bonus(p);
+	if (p->bonus > best_bonus)
+		p->bonus = best_bonus;
+}
+
+static void dec_bonus(task_t *p)
+{
+	if (p->bonus)
+		p->bonus--;
+}
+
+/*
+ * sched_interactive - sysctl which allows an interactive task's bonus to
+ * raise its priority.
 */
 int sched_interactive = 1;
 
 /*
- * effective_prio - dynamic priority dependent on burst.
+ * effective_prio - dynamic priority dependent on bonus.
  * The priority normally decreases by one each RR_INTERVAL.
- * As the burst increases the priority stays at the top "stair" or
+ * As the bonus increases the initial priority starts at a higher "stair" or
  * priority for longer.
  */
 static int effective_prio(task_t *p)
 {
 	int prio;
-	unsigned int full_slice, used_slice, first_slice;
-	unsigned int best_burst, rr;
+	unsigned int full_slice, used_slice = 0;
+	unsigned int best_bonus, rr;
+
 	if (rt_task(p))
 		return p->prio;
 	if (batch_task(p)) {
@@ -722,20 +731,17 @@ static int effective_prio(task_t *p)
 		return MAX_RT_PRIO;
 	}
 
-	best_burst = burst(p);
 	full_slice = slice(p);
+	if (full_slice > p->slice)
+		used_slice = full_slice - p->slice;
+
+	best_bonus = bonus(p);
+	prio = MAX_RT_PRIO + best_bonus;
+	if (sched_interactive && !sched_compute)
+		prio -= p->bonus;
+
 	rr = rr_interval(p);
-	used_slice = full_slice - p->slice;
-	if (p->burst > best_burst)
-		p->burst = best_burst;
-	first_slice = rr;
-	if (sched_interactive && !sched_compute && p->mm)
-		first_slice *= (p->burst + 1);
-	prio = MAX_PRIO - 2 - best_burst;
-
-	if (used_slice < first_slice)
-		return prio;
-	prio += 1 + (used_slice - first_slice) / rr;
+	prio += used_slice / rr;
 	if (prio >= MAX_PRIO - 2)
 		prio = MAX_PRIO - 2;
 	return prio;
@@ -747,7 +753,7 @@ static void continue_slice(task_t *p)
 
 	if (total_run >= p->slice) {
 		p->totalrun -= JIFFIES_TO_NS(p->slice);
-		dec_burst(p);
+		dec_bonus(p);
 	} else {
 		unsigned int remainder;
 		p->slice -= total_run;
@@ -769,16 +775,13 @@ static inline void recalc_task_prio(task
 
 	/*
 	 * Priority is elevated back to best by amount of sleep_time.
-	 * sleep_time is scaled down by number of tasks currently running.
 	 */
-	if (rq_running > 1)
-		sleep_time /= rq_running;
 
 	p->totalrun += p->runtime;
 	if (NS_TO_JIFFIES(p->totalrun) >= p->slice &&
 	    NS_TO_JIFFIES(sleep_time) < p->slice) {
 		p->flags &= ~PF_NONSLEEP;
-		dec_burst(p);
+		dec_bonus(p);
 		p->totalrun -= JIFFIES_TO_NS(p->slice);
 		if (sleep_time > p->totalrun)
 			p->totalrun = 0;
@@ -800,7 +803,7 @@ static inline void recalc_task_prio(task
 
 	if (sleep_time >= p->totalrun) {
 		if (!(p->flags & PF_NONSLEEP))
-			inc_burst(p);
+			inc_bonus(p, p->totalrun, sleep_time);
 		p->totalrun = 0;
 		goto out;
 	}
@@ -820,6 +823,8 @@ out:
 static void activate_task(task_t *p, runqueue_t *rq, int local)
 {
 	unsigned long long now = sched_clock();
+	unsigned long rr = rr_interval(p);
+
 #ifdef CONFIG_SMP
 	if (!local) {
 		/* Compensate for drifting sched_clock */
@@ -829,7 +834,7 @@ static void activate_task(task_t *p, run
 	}
 #endif
 	p->slice = slice(p);
-	p->time_slice = rr_interval(p);
+	p->time_slice = p->slice % rr ? : rr;
 	recalc_task_prio(p, now, rq->nr_running);
 	p->flags &= ~PF_NONSLEEP;
 	p->prio = effective_prio(p);
@@ -1453,10 +1458,10 @@ void fastcall wake_up_new_task(task_t *p
 	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
 
-	/*
-	 * Forked process gets no burst to prevent fork bombs.
+	/*
+	 * Forked process gets no bonus to prevent fork bombs.
 	 */
-	p->burst = 0;
+	p->bonus = 0;
 
 	if (likely(cpu == this_cpu)) {
 		current->flags |= PF_NONSLEEP;
@@ -2599,10 +2604,10 @@ void scheduler_tick(void)
 		goto out_unlock;
 	p->ns_debit %= NSJIFFY;
 	/*
-	 * Tasks lose burst each time they use up a full slice().
+	 * Tasks lose bonus each time they use up a full slice().
 	 */
 	if (!--p->slice) {
-		dec_burst(p);
+		dec_bonus(p);
 		p->slice = slice(p);
 		time_slice_expired(p, rq);
 		p->totalrun = 0;
@@ -3435,8 +3440,8 @@ void set_user_nice(task_t *p, long nice)
 	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
 	p->prio += delta;
-	if (p->burst > burst(p))
-		p->burst = burst(p);
+	if (p->bonus > bonus(p))
+		p->bonus = bonus(p);
 
 	if (queued) {
 		enqueue_task(p, rq);
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/kernel/sysctl.c linux-2.6.14-ck4/kernel/sysctl.c
--- linux-2.6.14-ck3/kernel/sysctl.c	2005-11-12 12:23:04.000000000 +1100
+++ linux-2.6.14-ck4/kernel/sysctl.c	2005-11-12 12:23:14.000000000 +1100
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_ope
 
 extern struct proc_dir_entry *proc_sys_root;
 
-static void register_proc_table(ctl_table *, struct proc_dir_entry *);
+static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
 static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
 #endif
 
@@ -1038,10 +1038,51 @@ static ctl_table dev_table[] = {
 
 extern void init_irq_proc (void);
 
+static DEFINE_SPINLOCK(sysctl_lock);
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+	if (unlikely(p->unregistering))
+		return 0;
+	p->used++;
+	return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+	if (!--p->used)
+		if (unlikely(p->unregistering))
+			complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+	/*
+	 * if p->used is 0, nobody will ever touch that entry again;
+	 * we'll eliminate all paths to it before dropping sysctl_lock
+	 */
+	if (unlikely(p->used)) {
+		struct completion wait;
+		init_completion(&wait);
+		p->unregistering = &wait;
+		spin_unlock(&sysctl_lock);
+		wait_for_completion(&wait);
+		spin_lock(&sysctl_lock);
+	}
+	/*
+	 * do not remove from the list until nobody holds it; walking the
+	 * list in do_sysctl() relies on that.
+	 */
+	list_del_init(&p->ctl_entry);
+}
+
 void __init sysctl_init(void)
 {
 #ifdef CONFIG_PROC_FS
-	register_proc_table(root_table, proc_sys_root);
+	register_proc_table(root_table, proc_sys_root, &root_table_header);
 	init_irq_proc();
 #endif
 }
 
@@ -1050,6 +1091,7 @@ int do_sysctl(int __user *name, int nlen
 	       void __user *newval, size_t newlen)
 {
 	struct list_head *tmp;
+	int error = -ENOTDIR;
 
 	if (nlen <= 0 || nlen >= CTL_MAXNAME)
 		return -ENOTDIR;
@@ -1058,20 +1100,30 @@ int do_sysctl(int __user *name, int nlen
 		if (!oldlenp || get_user(old_len, oldlenp))
 			return -EFAULT;
 	}
+	spin_lock(&sysctl_lock);
 	tmp = &root_table_header.ctl_entry;
 	do {
 		struct ctl_table_header *head =
 			list_entry(tmp, struct ctl_table_header, ctl_entry);
 		void *context = NULL;
-		int error = parse_table(name, nlen, oldval, oldlenp,
+
+		if (!use_table(head))
+			continue;
+
+		spin_unlock(&sysctl_lock);
+
+		error = parse_table(name, nlen, oldval, oldlenp,
 				newval, newlen, head->ctl_table,
 				&context);
 		kfree(context);
+
+		spin_lock(&sysctl_lock);
+		unuse_table(head);
 		if (error != -ENOTDIR)
-			return error;
-		tmp = tmp->next;
-	} while (tmp != &root_table_header.ctl_entry);
-	return -ENOTDIR;
+			break;
+	} while ((tmp = tmp->next) != &root_table_header.ctl_entry);
+	spin_unlock(&sysctl_lock);
+	return error;
 }
 
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
@@ -1282,12 +1334,16 @@ struct ctl_table_header *register_sysctl
 		return NULL;
 	tmp->ctl_table = table;
 	INIT_LIST_HEAD(&tmp->ctl_entry);
+	tmp->used = 0;
+	tmp->unregistering = NULL;
+	spin_lock(&sysctl_lock);
 	if (insert_at_head)
 		list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
 	else
 		list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+	spin_unlock(&sysctl_lock);
 #ifdef CONFIG_PROC_FS
-	register_proc_table(table, proc_sys_root);
+	register_proc_table(table, proc_sys_root, tmp);
 #endif
 	return tmp;
 }
@@ -1301,10 +1357,13 @@ struct ctl_table_header *register_sysctl
  */
 void unregister_sysctl_table(struct ctl_table_header * header)
 {
-	list_del(&header->ctl_entry);
+	might_sleep();
+	spin_lock(&sysctl_lock);
+	start_unregistering(header);
 #ifdef CONFIG_PROC_FS
 	unregister_proc_table(header->ctl_table, proc_sys_root);
 #endif
+	spin_unlock(&sysctl_lock);
 	kfree(header);
 }
 
@@ -1315,7 +1374,7 @@ void unregister_sysctl_table(struct ctl_
 #ifdef CONFIG_PROC_FS
 
 /* Scan the sysctl entries in table and add them all into /proc */
-static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
+static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
 {
 	struct proc_dir_entry *de;
 	int len;
@@ -1351,13 +1410,14 @@ static void register_proc_table(ctl_tabl
 			de = create_proc_entry(table->procname, mode, root);
 			if (!de)
 				continue;
+			de->set = set;
 			de->data = (void *) table;
 			if (table->proc_handler)
 				de->proc_fops = &proc_sys_file_operations;
 		}
 		table->de = de;
 		if (de->mode & S_IFDIR)
-			register_proc_table(table->child, de);
+			register_proc_table(table->child, de, set);
 	}
 }
 
@@ -1382,6 +1442,13 @@ static void unregister_proc_table(ctl_ta
 			continue;
 		}
 
+		/*
+		 * In any case, mark the entry as goner; we'll keep it
+		 * around if it's busy, but we'll know to do nothing with
+		 * its fields. We are under sysctl_lock here.
+		 */
+		de->data = NULL;
+
 		/* Don't unregister proc entries that are still being used.. */
 		if (atomic_read(&de->count))
 			continue;
@@ -1395,27 +1462,38 @@ static ssize_t do_rw_proc(int write, str
 		       size_t count, loff_t *ppos)
 {
 	int op;
-	struct proc_dir_entry *de;
+	struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
 	struct ctl_table *table;
 	size_t res;
-	ssize_t error;
-
-	de = PDE(file->f_dentry->d_inode);
-	if (!de || !de->data)
-		return -ENOTDIR;
-	table = (struct ctl_table *) de->data;
-	if (!table || !table->proc_handler)
-		return -ENOTDIR;
-	op = (write ? 002 : 004);
-	if (ctl_perm(table, op))
-		return -EPERM;
+	ssize_t error = -ENOTDIR;
 
-	res = count;
-
-	error = (*table->proc_handler) (table, write, file, buf, &res, ppos);
-	if (error)
-		return error;
-	return res;
+	spin_lock(&sysctl_lock);
+	if (de && de->data && use_table(de->set)) {
+		/*
+		 * at that point we know that sysctl was not unregistered
+		 * and won't be until we finish
+		 */
+		spin_unlock(&sysctl_lock);
+		table = (struct ctl_table *) de->data;
+		if (!table || !table->proc_handler)
+			goto out;
+		error = -EPERM;
+		op = (write ? 002 : 004);
+		if (ctl_perm(table, op))
+			goto out;
+
+		/* careful: calling conventions are nasty here */
+		res = count;
+		error = (*table->proc_handler)(table, write, file,
+						buf, &res, ppos);
+		if (!error)
+			error = res;
+	out:
+		spin_lock(&sysctl_lock);
+		unuse_table(de->set);
+	}
+	spin_unlock(&sysctl_lock);
+	return error;
 }
 
 static int proc_opensys(struct inode *inode, struct file *file)
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/Makefile linux-2.6.14-ck4/Makefile
--- linux-2.6.14-ck3/Makefile	2005-11-12 12:23:04.000000000 +1100
+++ linux-2.6.14-ck4/Makefile	2005-11-12 12:23:14.000000000 +1100
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 14
-EXTRAVERSION = -ck3
+EXTRAVERSION = -ck4
 NAME=Cognac Affected Albatross
 
 # *DOCUMENTATION*
diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck3/net/core/datagram.c linux-2.6.14-ck4/net/core/datagram.c
--- linux-2.6.14-ck3/net/core/datagram.c	2005-10-28 20:22:03.000000000 +1000
+++ linux-2.6.14-ck4/net/core/datagram.c	2005-11-12 12:23:14.000000000 +1100
@@ -213,6 +213,10 @@ int skb_copy_datagram_iovec(const struct
 {
 	int i, err, fraglen, end = 0;
 	struct sk_buff *next = skb_shinfo(skb)->frag_list;
+
+	if (!len)
+		return 0;
+
 next_skb:
 	fraglen = skb_headlen(skb);
 	i = -1;
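
---

For reference, the sysctl.c changes above amount to a small quiesce protocol: readers take a reference on a table under sysctl_lock, and unregistration first marks the table as dying, then waits for the reference count to drain before freeing anything. The following is a minimal user-space model of that protocol - an illustrative sketch only, not kernel code and not part of the patch. A pthread mutex stands in for sysctl_lock and a condition variable for the kernel's struct completion; the helper names merely mirror the patch's use_table()/unuse_table()/start_unregistering(). Build with: cc -pthread model.c

#include <pthread.h>
#include <stdio.h>

struct table_header {
	int used;		/* readers currently inside the table */
	int unregistering;	/* set once teardown has begun */
	pthread_cond_t drained;	/* signalled when the last reader leaves */
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* called with `lock` held: refuse new readers once teardown has started */
static int use_table(struct table_header *p)
{
	if (p->unregistering)
		return 0;
	p->used++;
	return 1;
}

/* called with `lock` held: the last reader out wakes the unregisterer */
static void unuse_table(struct table_header *p)
{
	if (!--p->used && p->unregistering)
		pthread_cond_signal(&p->drained);
}

/* called with `lock` held: blocks until every earlier reader has left */
static void start_unregistering(struct table_header *p)
{
	p->unregistering = 1;
	while (p->used)
		pthread_cond_wait(&p->drained, &lock);
}

/* a reader: the shape do_rw_proc() takes after the patch */
static void reader(struct table_header *p)
{
	pthread_mutex_lock(&lock);
	if (use_table(p)) {
		pthread_mutex_unlock(&lock);
		/* safe to touch the table here: it cannot be freed */
		pthread_mutex_lock(&lock);
		unuse_table(p);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct table_header t = { 0, 0, PTHREAD_COND_INITIALIZER };

	reader(&t);			/* in the kernel, readers run concurrently */
	pthread_mutex_lock(&lock);
	start_unregistering(&t);	/* returns only once t.used == 0 */
	pthread_mutex_unlock(&lock);
	puts("table quiesced; safe to free");
	return 0;
}

This is why the appldata hunk can move unregister_sysctl_table() outside its own spinlock: once start_unregistering() returns, no /proc reader can still be inside the table, so the kfree() that follows is safe.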
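Likewise, the staircase v13 priority arithmetic can be tried out in isolation. The sketch below distils only the effective_prio() hunk above; it assumes sched_interactive is 1 and sched_compute is 0, and hard-codes mainline's priority constants (MAX_RT_PRIO 100, MAX_PRIO 140). It is an illustration of the stair-step arithmetic, not scheduler code.

#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)

/*
 * task_user_prio mirrors TASK_USER_PRIO(p): 20 for a nice-0 task, so a
 * nice-0 task's bonus may reach 20 (bonus() simply returns it after the
 * patch).  used_slice and rr are in jiffies.
 */
static unsigned int effective_prio(unsigned int task_user_prio,
				   unsigned int cur_bonus,
				   unsigned int used_slice,
				   unsigned int rr)
{
	unsigned int prio = MAX_RT_PRIO + task_user_prio;	/* best_bonus */

	prio -= cur_bonus;		/* interactive bonus raises priority */
	prio += used_slice / rr;	/* descend one stair per RR interval */
	if (prio >= MAX_PRIO - 2)
		prio = MAX_PRIO - 2;	/* MAX_PRIO-1 is reserved for batch */
	return prio;
}

int main(void)
{
	/* nice-0 task, 6-jiffy RR interval: fully rested vs. fully drained */
	printf("full bonus, slice untouched: %u\n",
	       effective_prio(20, 20, 0, 6));	/* 100: best dynamic prio */
	printf("no bonus, 60 jiffies used:   %u\n",
	       effective_prio(20, 0, 60, 6));	/* 130: ten stairs down   */
	return 0;
}

The shape of the change versus v12.2 is visible here: instead of granting extra first-slice intervals at the top stair (the old burst logic), the bonus now simply lifts the starting stair, and the task walks down one priority per RR interval of slice it consumes.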