From: Ingo Molnar Add a new SCHED_BATCH (3) scheduling policy: such tasks are presumed CPU-intensive, and will acquire a constant +5 priority level penalty. This policy suits workloads that are non-interactive but do not want to give up their nice levels. The policy is also useful for workloads that want a deterministic scheduling policy without interactivity causing extra preemptions (between that workload's tasks). Signed-off-by: Ingo Molnar Cc: Michael Kerrisk Signed-off-by: Andrew Morton --- include/linux/sched.h | 7 +++--- kernel/exit.c | 4 ++- kernel/sched.c | 46 +++++++++++++++++++++++++++------------- 3 files changed, 39 insertions(+), 18 deletions(-) diff -puN include/linux/sched.h~sched-add-sched_batch-policy include/linux/sched.h --- devel/include/linux/sched.h~sched-add-sched_batch-policy 2006-01-04 00:35:10.000000000 -0800 +++ devel-akpm/include/linux/sched.h 2006-01-04 00:35:10.000000000 -0800 @@ -160,6 +160,7 @@ extern unsigned long nr_iowait(void); #define SCHED_NORMAL 0 #define SCHED_FIFO 1 #define SCHED_RR 2 +#define SCHED_BATCH 3 struct sched_param { int sched_priority; @@ -470,9 +471,9 @@ struct signal_struct { /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are - * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values - * are inverted: lower p->prio value means higher priority. + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. 
* * The MAX_USER_RT_PRIO value allows the actual maximum * RT priority to be separate from the value exported to diff -puN kernel/exit.c~sched-add-sched_batch-policy kernel/exit.c --- devel/kernel/exit.c~sched-add-sched_batch-policy 2006-01-04 00:35:10.000000000 -0800 +++ devel-akpm/kernel/exit.c 2006-01-04 00:35:10.000000000 -0800 @@ -242,7 +242,9 @@ static inline void reparent_to_init(void /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; - if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) + if ((current->policy == SCHED_NORMAL || + current->policy == SCHED_BATCH) + && (task_nice(current) < 0)) set_user_nice(current, 0); /* cpus_allowed? */ /* rt_priority? */ diff -puN kernel/sched.c~sched-add-sched_batch-policy kernel/sched.c --- devel/kernel/sched.c~sched-add-sched_batch-policy 2006-01-04 00:35:10.000000000 -0800 +++ devel-akpm/kernel/sched.c 2006-01-04 00:35:10.000000000 -0800 @@ -764,10 +764,14 @@ static int recalc_task_prio(task_t *p, u unsigned long long __sleep_time = now - p->timestamp; unsigned long sleep_time; - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; + if (unlikely(p->policy == SCHED_BATCH)) + sleep_time = 0; + else { + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + } if (likely(sleep_time > 0)) { /* @@ -3553,7 +3557,7 @@ void set_user_nice(task_t *p, long nice) * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL: + * not SCHED_NORMAL/SCHED_BATCH: */ if (rt_task(p)) { p->static_prio = NICE_TO_PRIO(nice); @@ -3707,10 +3711,16 @@ static void __setscheduler(struct task_s BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL) + if (policy != SCHED_NORMAL 
&& policy != SCHED_BATCH) { p->prio = MAX_RT_PRIO-1 - p->rt_priority; - else + } else { p->prio = p->static_prio; + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; + } set_bias_prio(p); } @@ -3735,29 +3745,35 @@ recheck: if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL) + policy != SCHED_NORMAL && policy != SCHED_BATCH) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and + * SCHED_BATCH is 0. */ if (param->sched_priority < 0 || (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) + if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) + != (param->sched_priority == 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - /* can't change policy */ - if (policy != p->policy && - !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + /* + * can't change policy, except between SCHED_NORMAL + * and SCHED_BATCH: + */ + if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && + (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) return -EPERM; /* can't increase priority */ - if (policy != SCHED_NORMAL && + if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && param->sched_priority > p->rt_priority && param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) @@ -4235,6 +4251,7 @@ asmlinkage long sys_sched_get_priority_m ret = MAX_USER_RT_PRIO-1; break; case SCHED_NORMAL: + case SCHED_BATCH: ret = 0; break; } @@ -4258,6 +4275,7 @@ asmlinkage long sys_sched_get_priority_m ret = 1; break; case SCHED_NORMAL: + case SCHED_BATCH: ret = 0; } return ret; _