GIT 0e77b48d6705dcb8b2954c6495b4c5a32c1eb85a git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git commit Author: Ingo Molnar Date: Wed Oct 10 14:00:21 2007 +0200 sched: do not wakeup-preempt with SCHED_BATCH tasks do not wakeup-preempt with SCHED_BATCH tasks, their preemption is batched too, driven by the tick. Signed-off-by: Ingo Molnar commit 287baaf371b9c964af107de8e5d8725668c0fe5f Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:21 2007 +0200 sched: generate uevents for user creation/destruction Generate uevents when a user is being created/destroyed. These events can be used to configure cpu share of a new user. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar commit 636690cb20c57ec9291199dbcee558b134f159dd Author: Ingo Molnar Date: Wed Oct 10 14:00:20 2007 +0200 sched: do not normalize kernel threads via SysRq-N do not normalize kernel threads via SysRq-N: the migration threads, softlockup threads, etc. might be essential for the system to function properly. So only zap user tasks. pointed out by Andi Kleen. Signed-off-by: Ingo Molnar commit d74d599c2ee03657e00adc940d69d7d0ec893c25 Author: Andi Kleen Date: Wed Oct 10 14:00:20 2007 +0200 sched: remove stale comment from sched_group_set_shares() remove stale comment from sched_group_set_shares(). Function never returns -EINVAL. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar commit 8acaa0d18e988990016cac687d4ccccbb0d3cb3f Author: Ingo Molnar Date: Wed Oct 10 14:00:20 2007 +0200 sched: clean up is_migration_thread() clean up is_migration_thread() and turn it into an inline function. Signed-off-by: Ingo Molnar commit e1a39f66b09e920fddf08e07a016e1354246ae8d Author: Andi Kleen Date: Wed Oct 10 14:00:20 2007 +0200 sched: cleanup: refactor normalize_rt_tasks Replace a particularly ugly ifdef with an inline and a new macro. Also split up the function to be easier to read. 
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar commit fdf49f9e126e56c2804f4a04daa18ba731ec206b Author: Andi Kleen Date: Wed Oct 10 14:00:20 2007 +0200 sched: cleanup: refactor common code of sleep_on / wait_for_completion Refactor common code of sleep_on / wait_for_completion These functions were largely cut'n'pasted. This moves the common code into single helpers instead. Advantage is about 1k less code on x86-64 and 91 lines of code removed. It adds one function call to the non timeout version of the functions; i don't expect this to be measurable. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar commit ab8e5be74026688344287ba474caa220f4536758 Author: Andi Kleen Date: Wed Oct 10 14:00:20 2007 +0200 sched: cleanup: remove unnecessary gotos Replace loops implemented with gotos with real loops. Replace err = ...; goto x; x: return err; with return ...; No functional changes. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar commit c77e4323f1411d4fbe39d2dd69ed61cc0417fd09 Author: Ingo Molnar Date: Wed Oct 10 14:00:20 2007 +0200 sched: update comment update comment: clarify time-slices and remove obsolete tuning detail. Signed-off-by: Ingo Molnar commit 6381da536d17b79242ef9c1a5d988eed728df83e Author: Mike Galbraith Date: Wed Oct 10 14:00:20 2007 +0200 sched: prevent wakeup over-scheduling Prevent wakeup over-scheduling. Once a task has been preempted by a task of the same or lower priority, it becomes ineligible for repeated preemption by same until it has been ticked, or slept. Instead, the task is marked for preemption at the next tick. Tasks of higher priority still preempt immediately. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar commit 86cbc94f642e280303f29f4fba4305ea839967ac Author: Peter Zijlstra Date: Wed Oct 10 14:00:20 2007 +0200 sched: disable forced preemption by default Implement feature bit to disable forced preemption. This way it can be checked whether a workload is overscheduling or not. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar commit 60919b6d3c31f7f3f8c38292df22369e49af4998 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:20 2007 +0200 sched: fix group scheduling for SCHED_BATCH The following patch (sched: disable sleeper_fairness on SCHED_BATCH) seems to break GROUP_SCHED. Although, it may be 'oops'-less due to the possibility of 'p' being always a valid address. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar commit 6e9cd1c4e1fdc091e0543fd108db24bf7ebd8e0a Author: Zou Nan hai Date: Wed Oct 10 14:00:20 2007 +0200 sched: some proc entries are missed in sched_domain sys_ctl debug code cache_nice_tries and flags entries do not appear in proc fs sched_domain directory, because ctl_table entry is skipped. This patch fixes the issue. Signed-off-by: Zou Nan hai Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar commit 7d3e08fc414ee1f4bfbb78776ef7c2303da830d6 Author: Gautham R Shenoy Date: Wed Oct 10 14:00:20 2007 +0200 sched: fix rt ptracer monopolizing CPU yield() in wait_task_inactive(), can cause a high priority thread to be scheduled back in, and thereby loop forever while it is waiting for some lower priority thread which is unfortunately still on the runqueue. Use schedule_timeout_uninterruptible(1) instead. Signed-off-by: Gautham R Shenoy Credit: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar commit dc69715cd6530890fb761d1c550acc7aaeb9c570 Author: Dhaval Giani Date: Wed Oct 10 14:00:19 2007 +0200 sched: group scheduling, sysfs tunables Add tunables in sysfs to modify a user's cpu share. A directory is created in sysfs for each new user in the system. /sys/kernel/uids/<uid>/cpu_share Reading this file returns the cpu shares granted for the user. Writing into this file modifies the cpu share for the user. Only an administrator is allowed to modify a user's cpu share. 
Ex: # cd /sys/kernel/uids/ # cat 512/cpu_share 1024 # echo 2048 > 512/cpu_share # cat 512/cpu_share 2048 # Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar commit ea04c919c3a0c5f1be7d4907ab09cd5a76d8d514 Author: Peter Zijlstra Date: Wed Oct 10 14:00:19 2007 +0200 sched: disable sleeper_fairness on SCHED_BATCH disable sleeper fairness for batch tasks - they are about batch processing after all. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar commit 7f5efd510abd70929ec97791d6f570039f96a87e Author: Peter Zijlstra Date: Wed Oct 10 14:00:19 2007 +0200 sched: another wakeup_granularity fix unit mis-match: wakeup_gran was used against a vruntime Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar commit 31080185c39640f37a989989b449c0653f104a72 Author: Paul E. McKenney Date: Wed Oct 10 14:00:19 2007 +0200 sched: export cpu_clock() export cpu_clock() - the preferred API instead of sched_clock(). Signed-off-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar commit d3b07329fad9134f681ad03863725c284726a7b7 Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: fix: move the CPU check into ->task_new_fair() noticed by Peter Zijlstra: fix: move the CPU check into ->task_new_fair(), this way we can call place_entity() and get child ->vruntime right at initial wakeup time. (without this there can be large latencies) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit b9751a2e5983e036243c1f6a54b38a46b4a83e94 Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: cleanup: function prototype cleanups noticed by Thomas Gleixner: cleanup: function prototype cleanups - move into single line wherever possible. Signed-off-by: Ingo Molnar commit 743b72c5b6038ba6ade1724f22333bdaca1d630c Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: cleanup: rename task_grp to task_group cleanup: rename task_grp to task_group. 
No need to save two characters and 'grp' is annoying to read. Signed-off-by: Ingo Molnar commit 250ce24cb48a2aced73711a6c6d331d228158de6 Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: cleanup: rename SCHED_FEAT_USE_TREE_AVG to SCHED_FEAT_TREE_AVG cleanup: rename SCHED_FEAT_USE_TREE_AVG to SCHED_FEAT_TREE_AVG, to make SCHED_FEAT_ names more consistent. Signed-off-by: Ingo Molnar commit 4cbd838acff17dc6804d3c23e9bcbc594b7a8e3f Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: kfree(NULL) is valid kfree(NULL) is valid. pointed out by checkpatch.pl. the fix shrinks the code a bit: text data bss dec hex filename 40024 3842 100 43966 abbe sched.o.before 40002 3842 100 43944 aba8 sched.o.after Signed-off-by: Ingo Molnar commit ced05c552b8e1629c31b1c43a956f7a557e0e944 Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: style cleanup fix up __setup() style bug - noticed via checkpatch.pl. Signed-off-by: Ingo Molnar commit a40ccbe4414680569320256a22c6d25ee1dc38e8 Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: break out if printing a warning in sched_domain_debug() checkpatch.pl and Andy Whitcroft noticed the following bug: we did not break out after printing an error. Signed-off-by: Ingo Molnar commit 458660f538d8e59285f4ef6536c272744ffb7206 Author: Ingo Molnar Date: Wed Oct 10 14:00:19 2007 +0200 sched: run sched_domain_debug() if CONFIG_SCHED_DEBUG=y run sched_domain_debug() if CONFIG_SCHED_DEBUG=y, instead of relying on the hand-crafted SCHED_DOMAIN_DEBUG switch. Signed-off-by: Ingo Molnar commit 9dbea636b7ebe0c20e55171820ac45bd1550ab31 Author: Mike Galbraith Date: Wed Oct 10 14:00:19 2007 +0200 sched: cleanup, remove the TASK_NONINTERACTIVE flag Here's another piece of low hanging obsolete fruit. Remove obsolete TASK_NONINTERACTIVE. 
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar commit 8baf75d3e4e497b15da5958f0a8e1b39cffc8378 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:19 2007 +0200 sched: cleanup, make dequeue_entity() and update_stats_wait_end() similar make dequeue_entity() / enqueue_entity() and update_stats_dequeue() / update_stats_enqueue() look similar, structure-wise. zero effect, functionality-wise: text data bss dec hex filename 34550 3026 100 37676 932c sched.o.before 34550 3026 100 37676 932c sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar commit d4b74a79c400252dcd6e4a0020f8c338f0743494 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:19 2007 +0200 sched: cleanup, remove calc_weighted() remove obsolete code -- calc_weighted() Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar commit dd3fec36addd1bf76b05225b7e483378b80c3f9e Author: Dmitry Adamushko Date: Wed Oct 10 14:00:18 2007 +0200 sched: tidy up SCHED_RR - make timeslices of SCHED_RR tasks constant and not dependent on task's static_prio [1] ; - remove obsolete code (timeslice related bits); - make sched_rr_get_interval() return something more meaningful [2] for SCHED_OTHER tasks. [1] according to the following link, it's not compliant with SUSv3 (not sure though, what is the reference for us :-) http://lkml.org/lkml/2007/3/7/656 [2] the interval is dynamic and can be depicted as follows "should a task be one of the runnable tasks at this particular moment, it would expect to run for this interval of time before being re-scheduled by the scheduler tick". (i.e. it's more precise if a task is runnable at the moment) yeah, this seems to require task_rq_lock/unlock() but this is not a hot path. 
results: (SCHED_FIFO) dimm@earth:~/storage/prog$ sudo chrt -f 10 ./rr_interval time_slice: 0 : 0 (SCHED_RR) dimm@earth:~/storage/prog$ sudo chrt 10 ./rr_interval time_slice: 0 : 99984800 (SCHED_NORMAL) dimm@earth:~/storage/prog$ ./rr_interval time_slice: 0 : 19996960 (SCHED_NORMAL + a cpu_hog of similar 'weight' on the same CPU --- so should be a half of the previous result) dimm@earth:~/storage/prog$ taskset 1 ./rr_interval time_slice: 0 : 9998480 Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar commit bf3e51b4607ff7560dd4781c3213eb4a0f92d1a7 Author: Alexey Dobriyan Date: Wed Oct 10 14:00:18 2007 +0200 sched: uninline scheduler * save ~300 bytes * activate_idle_task() was moved to avoid a warning bloat-o-meter output: add/remove: 6/0 grow/shrink: 0/16 up/down: 438/-733 (-295) <=== function old new delta __enqueue_entity - 165 +165 finish_task_switch - 110 +110 update_curr_rt - 79 +79 __load_balance_iterator - 32 +32 __task_rq_unlock - 28 +28 find_process_by_pid - 24 +24 do_sched_setscheduler 133 123 -10 sys_sched_rr_get_interval 176 165 -11 sys_sched_getparam 156 145 -11 normalize_rt_tasks 482 470 -12 sched_getaffinity 112 99 -13 sys_sched_getscheduler 86 72 -14 sched_setaffinity 226 212 -14 sched_setscheduler 666 642 -24 load_balance_start_fair 33 9 -24 load_balance_next_fair 33 9 -24 dequeue_task_rt 133 67 -66 put_prev_task_rt 97 28 -69 schedule_tail 133 50 -83 schedule 682 594 -88 enqueue_entity 499 366 -133 task_new_fair 317 180 -137 Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar commit 0714742c16894af7b460e1adf40809f95216db2e Author: Ingo Molnar Date: Wed Oct 10 14:00:18 2007 +0200 sched: tweak wakeup granularity tweak wakeup granularity. Signed-off-by: Ingo Molnar commit 431917fdb44792aa2654f32e2a7366ff884de1a7 Author: Ingo Molnar Date: Wed Oct 10 14:00:18 2007 +0200 sched: optimize schedule() a bit on SMP optimize schedule() a bit on SMP, by moving the rq-clock update outside the rq lock. 
code size is the same: text data bss dec hex filename 25725 2666 96 28487 6f47 sched.o.before 25725 2666 96 28487 6f47 sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit 1794c6cb73b9af2a5ede2266b8ee49dbce1c5563 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:18 2007 +0200 sched: fix __pick_next_entity() The thing is that __pick_next_entity() must never be called when first_fair(cfs_rq) == NULL. It wouldn't be a problem, should 'run_node' be the very first field of 'struct sched_entity' (and it's the second). The 'nr_running != 0' check is _not_ enough, due to the fact that 'current' is not within the tree. Generic paths are ok (e.g. schedule() as put_prev_task() is called previously)... I'm more worried about e.g. migration_call() -> CPU_DEAD_FROZEN -> migrate_dead_tasks()... if 'current' == rq->idle, no problems.. if it's one of the SCHED_NORMAL tasks (or imagine, some other use-cases in the future -- i.e. we should not make outer world dependent on internal details of sched_fair class) -- it may be "Houston, we've got a problem" case. it's +16 bytes to the ".text". Another variant is to make 'run_node' the first data member of 'struct sched_entity' but an additional check (se != NULL) is still needed in pick_next_entity(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit 77524e10531c8e98a6462fbbbdcb41e8e46ac41d Author: Ingo Molnar Date: Wed Oct 10 14:00:18 2007 +0200 sched: vslice fixups for non-0 nice levels Make vslice accurate wrt nice levels, and add some comments while we're at it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit 39e1930b5d5890ff145f5271b8921a88876409da Author: Ingo Molnar Date: Wed Oct 10 14:00:18 2007 +0200 sched: whitespace cleanups more whitespace cleanups. 
No code changed: text data bss dec hex filename 26553 2790 288 29631 73bf sched.o.before 26553 2790 288 29631 73bf sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit 54d3824c0b5f614238579015eed13f31f39dfe9f Author: Ingo Molnar Date: Wed Oct 10 14:00:18 2007 +0200 sched: mark scheduling classes as const mark scheduling classes as const. This speeds up the code a bit and shrinks it: text data bss dec hex filename 40027 4018 292 44337 ad31 sched.o.before 40190 3842 292 44324 ad24 sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit aac37922b58a4992a6cf602f02cf76560c61b6df Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:18 2007 +0200 sched: group scheduler, fix latency There is a possibility that because of a task of a group moving from one cpu to another, it may gain more cpu time than desired. See http://marc.info/?l=linux-kernel&m=119073197730334 for details. This is an attempt to fix that problem. Basically it simulates dequeue of higher level entities as if they are going to sleep. Similarly it simulates wakeup of higher level entities as if they are waking up from sleep. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit fbea853ea5aa6186ae77ba2bf807444554e8f5e4 Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:18 2007 +0200 sched: group scheduler, fix bloat Recent fix to check_preempt_wakeup() to check for preemption at higher levels caused a size bloat for !CONFIG_FAIR_GROUP_SCHED. Fix the problem. 
42277 10598 320 53195 cfcb kernel/sched.o-before_this_patch 42216 10598 320 53134 cf8e kernel/sched.o-after_this_patch Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit d162aa50d505c94d19d94f15c72c62fb4c01db4b Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:18 2007 +0200 sched: group scheduler, fix coding style issues Fix coding style issues reported by Randy Dunlap and others Signed-off-by: Dhaval Giani Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit d400d918c6ec48e0addd82640c212e1b61cd0704 Author: Ingo Molnar Date: Wed Oct 10 14:00:18 2007 +0200 sched: cleanup, remove stale comment cleanup, remove stale comment. Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit 621ff6c085c09156b5f7ce0ffbae3ad24a6e603b Author: Peter Zijlstra Date: Wed Oct 10 14:00:18 2007 +0200 sched: speed up and simplify vslice calculations speed up and simplify vslice calculations. [ From: Mike Galbraith : build fix ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar commit ef719c247f34d0dfc22db8a391e4c7a0b41e2a95 Author: Peter Zijlstra Date: Wed Oct 10 14:00:18 2007 +0200 sched: clean up min_vruntime use clean up min_vruntime use. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar commit 212e7e64ada7b64c29b70b5fdf2629b662acafbd Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:18 2007 +0200 sched: group scheduler SMP migration fix group scheduler SMP migration fix: use task_cfs_rq(p) to get to the relevant fair-scheduling runqueue of a task, rq->cfs is not the right one. Signed-off-by: Ingo Molnar commit 741b0673bf821001145ac13e319c15b802ac241e Author: Ingo Molnar Date: Wed Oct 10 14:00:18 2007 +0200 sched: clean up schedstats, cnt -> count rename all 'cnt' fields and variables to the less yucky 'count' name. yuckage noticed by Andrew Morton. 
no change in code, other than the /proc/sched_debug bkl_count string got a bit larger: text data bss dec hex filename 38236 3506 24 41766 a326 sched.o.before 38240 3506 24 41770 a32a sched.o.after Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit 0d65293d5a326374b1032418edd818a1fae128a7 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:17 2007 +0200 sched: yield fix fix yield bugs due to the current-not-in-rbtree changes: the task is not in the rbtree so rbtree-removal is a no-no. [ From: Srivatsa Vaddagiri : build fix. ] also, nice code size reduction: kernel/sched.o: text data bss dec hex filename 38323 3506 24 41853 a37d sched.o.before 38236 3506 24 41766 a326 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Dmitry Adamushko Reviewed-by: Thomas Gleixner commit d64920301c2640c81dee529ea9ec3fdbc976719d Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:17 2007 +0200 sched: group scheduler wakeup latency fix group scheduler wakeup latency fix: when checking for preemption we must check cross-group too, not just intra-group. Signed-off-by: Ingo Molnar commit 14998617f7792c6d1ad6d93ec37d297f4470f86e Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched: remove set_leftmost() Lee Schermerhorn noticed that set_leftmost() contains dead code, remove this. Reported-by: Lee Schermerhorn Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit a045e63a662f6827bab6bdb281e8ed51d9af5bc1 Author: Hiroshi Shimamoto Date: Wed Oct 10 14:00:17 2007 +0200 sched: clean up sched_fork() The adjusting sched_class is a missing part of the already existing "do not leak PI boosting priority to the child" at the sched_fork(). This patch moves the adjusting sched_class from wake_up_new_task() to sched_fork(). 
this also shrinks the code a bit: text data bss dec hex filename 40111 4018 292 44421 ad85 sched.o.before 40102 4018 292 44412 ad7c sched.o.after Signed-off-by: Hiroshi Shimamoto Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit bd19da4c83c1e7117460d65c195ee7a27bb1cd0f Author: Peter Zijlstra Date: Wed Oct 10 14:00:17 2007 +0200 sched: max_vruntime() simplification max_vruntime() simplification. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit 51021b9915514703115bffcefa010b82cec32a36 Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched: fix sched_fork() fix sched_fork(): large latencies at new task creation time because the ->vruntime was not fixed up cross-CPU, if the parent got migrated after the child's CPU got set up. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 6cc19bcbe30dd9992e34008c1821c72b64c2e1f6 Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched: fix sign check error in place_entity() fix sign check error in place_entity() - we'd get excessive latencies due to negatives being converted to large u64's. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit 018968ae9c4265a469d4616e97ebeeaabc041808 Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched: undo some of the recent changes undo some of the recent changes that are not needed after all, such as last_min_vruntime. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit d4bb2d3089c2046f9ed33a5a7d1931c97359d818 Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched: remove last_min_vruntime effect remove last_min_vruntime use - prepare to remove it. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit b16cefdfe3bb7a31fa5065b2abb54ae9ba5d8ac9 Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched: remove condition from set_task_cpu() remove condition from set_task_cpu(). 
Now that ->vruntime is not global anymore, it should (and does) work fine without it too. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit 150889501583032c009c661eb18691d1ae1ab5f6 Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched: entity_key() fix entity_key() fix - we'd occasionally end up with a 0 vruntime in the !initial case. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit 58bcff8f7d535f3ccaf7f637e0af6e840393b007 Author: Peter Zijlstra Date: Wed Oct 10 14:00:17 2007 +0200 sched debug: check spread debug feature: check how well we schedule within a reasonable vruntime 'spread' range. (note that CPU overload can increase the spread, so this is not a hard condition, but normal loads should be within the spread.) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit 79f3794acf0ba8e93e245e707cde881661501058 Author: Ingo Molnar Date: Wed Oct 10 14:00:17 2007 +0200 sched debug: more width for parameter printouts more width for parameter printouts in /proc/sched_debug. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit b0454bdba164b9f3989e7b7a463d4cfd7fe44c3a Author: Peter Zijlstra Date: Wed Oct 10 14:00:16 2007 +0200 sched: add vslice add vslice: the load-dependent "virtual slice" a task should run ideally, so that the observed latency stays within the sched_latency window. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit f13e95553356563c83503021a770c5c80d8622a8 Author: Ingo Molnar Date: Wed Oct 10 14:00:16 2007 +0200 sched debug: print settings print the current value of all tunables in /proc/sched_debug output. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 9b97de50d244e24813294a43bdec3e51ec67babb Author: Ingo Molnar Date: Wed Oct 10 14:00:16 2007 +0200 sched: remove unneeded tunables remove unneeded tunables. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 8fa180c0f32035c0db94d775ea41ecdc0bf55d8a Author: S.Caglar Onur Date: Wed Oct 10 14:00:16 2007 +0200 sched debug: BKL usage statistics, fix build fix for the SCHED_DEBUG && !SCHEDSTATS case. Signed-off-by: S.Caglar Onur Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit a77efd7a8ad2c0e96d2202a8a1a01a320b8a28fd Author: Ingo Molnar Date: Wed Oct 10 14:00:16 2007 +0200 sched debug: BKL usage statistics add per task and per rq BKL usage statistics. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 71805d19e8d9973008dc62af4dce68cee987f5a0 Author: Ingo Molnar Date: Wed Oct 10 14:00:16 2007 +0200 sched: enable CONFIG_FAIR_GROUP_SCHED=y by default enable CONFIG_FAIR_GROUP_SCHED=y by default. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 94930adf39689e4412dd24fc89eb2ac6ad501b88 Author: Ingo Molnar Date: Wed Oct 10 14:00:16 2007 +0200 sched: fair-group sched, cleanups fair-group sched, cleanups. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 48e9cd98f676a4903d70a6b4fd324095b4883369 Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:16 2007 +0200 sched: add fair-user scheduler Enable user-id based fair group scheduling. This is useful for anyone who wants to test the group scheduler w/o having to enable CONFIG_CGROUPS. A separate scheduling group (i.e. struct task_grp) is automatically created for every new user added to the system. Upon uid change for a task, it is made to move to the corresponding scheduling group. A /proc tunable (/proc/root_user_share) is also provided to tune root user's quota of cpu bandwidth. 
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit ec6624b1ad2c41510b619f817aff71952d735447 Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:16 2007 +0200 sched: clean up code under CONFIG_FAIR_GROUP_SCHED With the view of supporting user-id based fair scheduling (and not just container-based fair scheduling), this patch renames several functions and makes them independent of whether they are being used for container or user-id based fair scheduling. Also fix a problem reported by KAMEZAWA Hiroyuki (wrt allocating undersized arrays for tg->cfs_rq[] and tg->se[]). Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 89e7763df9d9a1055b2930fab816515b52f73ca7 Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:16 2007 +0200 sched: print &rq->cfs stats - Print &rq->cfs statistics as well (useful for group scheduling) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit c8c6829c343309097cfc4f63ebfdf15f0d48338d Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:16 2007 +0200 sched: print nr_running and load in /proc/sched_debug - print nr_running and load information for cfs_rq in /proc/sched_debug Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 6cef7aea6c72f174f9d897875a818198fab19c65 Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:16 2007 +0200 sched: fix minor bug in yield - fix a minor bug in yield (seen for CONFIG_FAIR_GROUP_SCHED), group scheduling would skew when yield was called. 
Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 380a55312a123e463175edc44503bb1b524705c5 Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:16 2007 +0200 sched: revert recent removal of set_curr_task() Revert removal of set_curr_task. Use put_prev_task/set_curr_task when changing groups/policies Signed-off-by: Srivatsa Vaddagiri < vatsa@linux.vnet.ibm.com> Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit c98938eeced06bf5415019cd053f24d80c898f30 Author: Ingo Molnar Date: Wed Oct 10 14:00:15 2007 +0200 sched: kernel/sched_fair.c whitespace cleanups some trivial whitespace cleanups. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 336b5793d421d1e015d0cd653d4d7f06968989cd Author: Mike Galbraith Date: Wed Oct 10 14:00:15 2007 +0200 sched: fix formatting of /proc/sched_debug fix formatting of /proc/sched_debug Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit ac842d299f2a2da32ac8e317af194d66fb1f719b Author: Ingo Molnar Date: Wed Oct 10 14:00:15 2007 +0200 sched: enhance debug output enhance debug output by changing 12345678 nsecs to 12.345678 output, this is more human-readable. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit c1dc79c2e77a4ed6accd5616d7d2f3ece3ff9d77 Author: Ingo Molnar Date: Wed Oct 10 14:00:15 2007 +0200 sched: prettify /proc/sched_debug output print the correct amount of dashes in /proc/sched_debug. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 9ec21ce9bff18bb6a489024b146a52a8b3984168 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:15 2007 +0200 sched: rework enqueue/dequeue_entity() to get rid of set_curr_task() rework enqueue/dequeue_entity() to get rid of sched_class::set_curr_task(). 
This simplifies sched_setscheduler(), rt_mutex_setprio() and sched_move_tasks(). text data bss dec hex filename 24330 2734 20 27084 69cc sched.o.before 24233 2730 20 26983 6967 sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 5aeb3f33ace20bce2b2bd25d573b38998d4a0c3b Author: Dmitry Adamushko Date: Wed Oct 10 14:00:15 2007 +0200 sched: simplify sched_class::yield_task() the 'p' (task_struct) parameter in the sched_class :: yield_task() is redundant as the caller is always the 'current'. Get rid of it. text data bss dec hex filename 24341 2734 20 27095 69d7 sched.o.before 24330 2734 20 27084 69cc sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 91ce9cc2879488213b85d6741fc46ed9779ac3de Author: Dmitry Adamushko Date: Wed Oct 10 14:00:15 2007 +0200 sched: optimize task_new_fair() due to the fact that we no longer keep the 'current' within the tree, dequeue/enqueue_entity() is useless for the 'current' in task_new_fair(). We are about to reschedule and sched_class->put_prev_task() will put the 'current' back into the tree, based on its new key. text data bss dec hex filename 24388 2734 20 27142 6a06 sched.o.before 24341 2734 20 27095 69d7 sched.o.after Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 61843af1a8b468bd56da39c072033409ac238f7e Author: Ingo Molnar Date: Wed Oct 10 14:00:15 2007 +0200 sched: fix delay accounting performance regression fix delay accounting performance regression - those sched_clock() calls are not needed. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit e271703e74c6b93d6d9c1da0bac479b2b58f5b7c Author: Dmitry Adamushko Date: Wed Oct 10 14:00:15 2007 +0200 sched: do not keep current in the tree and get rid of sched_entity::fair_key Get rid of 'sched_entity::fair_key'. As a side effect, 'current' is not kept within the tree for SCHED_NORMAL/BATCH tasks anymore. This simplifies some parts of code (e.g. entity_tick() and yield_task_fair()) and also somewhat optimizes them (e.g. a single update_curr() now vs. dequeue/enqueue() before in entity_tick()). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit c3594be698b3c36706b5963b1a6d02058bb2d817 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:15 2007 +0200 sched: add set_curr_task() calls p->sched_class->set_curr_task() has to be called before activate_task()/enqueue_task() in rt_mutex_setprio(), sched_setscheduler() and sched_move_task() in order to set up 'cfs_rq->curr'. The logic of enqueueing depends on whether a task to be inserted is 'current' or not. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 967fd5b5bde35488a04b884dc9dcb7e9da96ed9a Author: Dmitry Adamushko Date: Wed Oct 10 14:00:15 2007 +0200 sched: sched_setscheduler() fix Fix a problem in the 'sched-group' patch for !CONFIG_FAIR_GROUP_SCHED. description: sched_setscheduler() { ... if (task_running()) p->sched_class->put_prev_entity(); [ this one sets up cfs_rq->curr to NULL ] ... if (task_running) p->sched_class->set_curr_task(); [ and this one is a _NOP_ (empty) for !CONFIG_FAIR_GROUP_SCHED ] As a result, the task continues to run with cfs_rq->curr == NULL... no crashes (due to checks for !NULL in place) but e.g. update_curr() effectively becomes a NOP... i.e. runtime statistics for this task are not accounted until it's rescheduled anew. 
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 49a4094299878e76b1fe4d1be673db9e2a7eda63 Author: Srivatsa Vaddagiri Date: Wed Oct 10 14:00:15 2007 +0200 sched: group-scheduler core Add interface to control cpu bandwidth allocation to task-groups. (not yet configurable, due to missing CONFIG_CONTAINERS) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra commit 3bcb79a8c2edc4078412334d770ca59f0a43b3ee Author: Mike Galbraith Date: Wed Oct 10 14:00:15 2007 +0200 sched: fix SMP migration latencies fix SMP migration latencies: the vruntimes of different CPUs are at incompatible offsets so they have to be fixed up when migrating a task across CPUs. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 28ace9ce7c33018e7619ec12638417ba067b0583 Author: Peter Zijlstra Date: Wed Oct 10 14:00:15 2007 +0200 sched: better min_vruntime tracking Better min_vruntime tracking: update it every time 'curr' is updated - not just when a task is enqueued into the tree. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 98f5fe2e066769d1acbf863e0d2ed29efde7ebe8 Author: Ingo Molnar Date: Wed Oct 10 14:00:15 2007 +0200 sched: x86: allow single-depth wchan output sched.o gets smaller and faster if we compile it with -fomit-frame-pointers, so make this a config option. The cost is the loss of multi-depth wchan lookups - but SysRq-T is a sufficient replacement for them anyway, so their utility is much lower these days. 
the size difference is significant: text data bss dec hex filename 34005 3462 24 37491 9273 sched.o.before 33470 3462 24 36956 905c sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit af60e9b2e9973ef111de234dd8cedd0c51fcb9f2 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:14 2007 +0200 sched: clean up schedstat block in dequeue_entity() Better placement of #ifdef CONFIG_SCHEDSTAT block in dequeue_entity(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit c859850461207e710bf0638780200f159cc354bb Author: Ingo Molnar Date: Wed Oct 10 14:00:14 2007 +0200 sched: remove wait_runtime fields and features remove wait_runtime based fields and features, now that the CFS math has been changed over to the vruntime metric. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 430bcb9ac0e6f692dc7273dc8690401d0a1f632e Author: Ingo Molnar Date: Wed Oct 10 14:00:14 2007 +0200 sched: remove wait_runtime limit remove the wait_runtime-limit fields and the code depending on it, now that the math has been changed over to rely on the vruntime metric. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 74b5434de4ccacd85366c018dcc640e58af0f765 Author: Dmitry Adamushko Date: Wed Oct 10 14:00:14 2007 +0200 sched: clean up struct load_stat 'struct load_stat' is redundant now so let's get rid of it. 
Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit da7f28a409efe61f127794f790fe099628ce6f3e Author: Ingo Molnar Date: Wed Oct 10 14:00:14 2007 +0200 sched: debug: update exec_clock only when SCHED_DEBUG micro-optimization: update cfs_rq->exec_clock only if CONFIG_SCHED_DEBUG=y. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit e2fd00750696b3f04c31eb7d59e6b6d30796a323 Author: Ingo Molnar Date: Wed Oct 10 14:00:14 2007 +0200 sched: add more vruntime statistics add more vruntime statistics. Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Reviewed-by: Thomas Gleixner commit 684b4d573eb1e8337dd33d5f2418a642ea4748ac Author: Peter Zijlstra Date: Wed Oct 10 14:00:14 2007 +0200 sched: handle vruntime 64-bit overflow Handle vruntime overflow by centering the key space around min_vruntime. ( otherwise we could overflow 64-bit vruntime in a few days with SCHED_IDLE tasks - or in a few years with nice +19. ) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit e017479aa3a8cdaf026fb5d5b75ef6c193c5ab4d Author: Peter Zijlstra Date: Wed Oct 10 14:00:14 2007 +0200 sched: add tree based averages add support for tree based vruntime averages. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 3d5cfceb857f312bfbaecdbaae3dc0ef8292943c Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: remove SCHED_FEAT_SKIP_INITIAL remove SCHED_FEAT_SKIP_INITIAL - it was off by default and even when enabled it never made any real difference. 
Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner commit 02ac961ae5514ffe2340faedc65c204ce7e9064a Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: add se->vruntime debugging debug se->vruntime fields. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith commit 7fccca28ef99155e079fe24346a39c6b76405906 Author: Peter Zijlstra Date: Wed Oct 10 14:00:13 2007 +0200 sched: clean up new task placement clean up new task placement. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith commit b757124758dd5ff47e56467f42893c7e059aef1a Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: wakeup granularity increase increase wakeup granularity - we were overscheduling a bit. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith commit dbc68a4d464c593e52887eb66518992086d5fceb Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: simplify check_preempt() methods simplify the check_preempt() methods. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith commit 19001ccad7b05a6f9d75f12615d43a70f034ca79 Author: Peter Zijlstra Date: Wed Oct 10 14:00:13 2007 +0200 sched: simplify adaptive latency simplify adaptive latency. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 9a70517465125f4677acf783d0b3b6d6244ee3ee Author: Peter Zijlstra Date: Wed Oct 10 14:00:13 2007 +0200 sched: new task placement for vruntime add proper new task placement for the vruntime based math too. ( note: introduces a swap() macro, but the swap token is too widely used in the kernel namespace for a generic version to be added without changing non-scheduler code - so this cleanup will be done separately. 
) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 49d1e3e6d23690a72f54ca39f80d81248f680c50 Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: optimize vruntime based scheduling optimize vruntime based scheduling. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 503537df68fac1779b10ca307b7b971cd59bf28b Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: move sched_feat() definitions move sched_feat() definitions so that it can be used sooner by generic code too. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit ecaaa3912c25e08689675b5c54834ceff70d242d Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: introduce se->vruntime introduce se->vruntime as a sum of weighted delta-exec's, and use that as the key into the tree. the idea to use absolute virtual time as the basic metric of scheduling has been first raised by William Lee Irwin, advanced by Tong Li and first prototyped by Roman Zippel in the "Really Fair Scheduler" (RFS) patchset. also see: http://lkml.org/lkml/2007/9/2/76 for a simpler variant of this patch. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 8f5c0e4748fdc239e505be6d4585f2bf10768c08 Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: clean up calc_weighted() clean up calc_weighted() - we always use the normalized shift so it's not needed to pass that in. Also, push the non-nice0 branch into the function. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit f35e4a9b32e503148acb7cb7e79b5dd585d06d15 Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: speed up update_load_add/_sub() speed up update_load_add/_sub() by not delaying the division - this reduces CPU pipeline dependencies. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit aed6e15b2b67ca77b9e2e5dcaba7a726663c40aa Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: uninline __enqueue_entity()/__dequeue_entity() suggested by Roman Zippel: uninline __enqueue_entity() and __dequeue_entity(). this reduces code size: text data bss dec hex filename 25385 2386 16 27787 6c8b sched.o.before 25257 2386 16 27659 6c0b sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit cb410ae4051db8ada6b6eb489eb3852cdf755f81 Author: Peter Zijlstra Date: Wed Oct 10 14:00:13 2007 +0200 sched: simplify SCHED_FEAT_* code Peter Zijlstra suggested to simplify SCHED_FEAT_* checks via the sched_feat(x) macro. No code changed: text data bss dec hex filename 38895 3550 24 42469 a5e5 sched.o.before 38895 3550 24 42469 a5e5 sched.o.after Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 7993c371d4094b1f19629f6c4705771878ffbe4f Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: cleanup: simplify cfs_rq_curr() methods cleanup: simplify cfs_rq_curr() methods - now that the cfs_rq->curr pointer is unconditionally present, remove the wrappers. 
kernel/sched.o: text data bss dec hex filename 11784 224 2012 14020 36c4 sched.o.before 11784 224 2012 14020 36c4 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit c5fabae8653340985580724246302931d0089f7e Author: Ingo Molnar Date: Wed Oct 10 14:00:13 2007 +0200 sched: track cfs_rq->curr on !group-scheduling too Noticed by Roman Zippel: use cfs_rq->curr in the !group-scheduling case too. Small micro-optimization and cleanup effect: text data bss dec hex filename 36269 3482 24 39775 9b5f sched.o.before 36177 3486 24 39687 9b07 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 6be0e0989133b6ce8b689efb6f60b7421ca46097 Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: remove precise CPU load calculations #2 continued removal of precise CPU load calculations. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit d2dbcbb53266d70e3806cb6631e636539e2babcd Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: remove precise CPU load CPU load calculations are statistical anyway, and there's little benefit from having it calculated on every scheduling event. So remove this code, it gets rid of a divide from the scheduler wakeup and context-switch fastpath. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 7b8088902d17bb53658d1613c18df3840470674b Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: remove stat_gran remove the stat_gran code - it was disabled by default and it causes unnecessary overhead. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 6c8056417fe41df4fe185761ad9de56c2ca7670a Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: use constants if !CONFIG_SCHED_DEBUG use constants if !CONFIG_SCHED_DEBUG. this speeds up the code and reduces code-size: text data bss dec hex filename 27464 3014 16 30494 771e sched.o.before 26929 3010 20 29959 7507 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit c2250c03479cd1025e26e92b9b12b58ce107c226 Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: uniform tunings use the same defaults on both UP and SMP. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit fcba95016d91ce6e87dfad257da08ddac884e597 Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: debug: track maximum 'slice' track the maximum amount of time a task has executed while the CPU load was at least 2x. (i.e. at least two nice-0 tasks were runnable) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit d1df37fa8c61a3e09b672ad94436a2a917231867 Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: small sched_debug cleanup small kernel/sched_debug.c cleanup - break up multi-variable assignment. 
no code changed: text data bss dec hex filename 38869 3550 24 42443 a5cb sched.o.before 38869 3550 24 42443 a5cb sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit 6976890e712bcbc39af19a56c301e07171b5f15f Author: Matthias Kaehlcke Date: Wed Oct 10 14:00:12 2007 +0200 sched: use list_for_each_entry_safe() in __wake_up_common() Use list_for_each_entry_safe() instead of list_for_each_safe() in __wake_up_common() Signed-off-by: Matthias Kaehlcke Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit c49f202a99627fd34e6a8823c563674518624b68 Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: resched task in task_new_fair() to get full child-runs-first semantics make sure the parent is rescheduled. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner commit cd11ff158c6946d23054c8b9d9f9c6fe2ba67924 Author: Ingo Molnar Date: Wed Oct 10 14:00:12 2007 +0200 sched: fix sysctl_sched_child_runs_first flag fix the sched_child_runs_first flag: always call into ->task_new() if we are on the same CPU, as SCHED_OTHER tasks depend on it for correct initial setup. 
Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith Reviewed-by: Thomas Gleixner Documentation/sched-design-CFS.txt | 67 ++ arch/i386/Kconfig | 11 fs/pipe.c | 3 fs/proc/base.c | 2 include/linux/sched.h | 76 +- init/Kconfig | 21 + kernel/delayacct.c | 2 kernel/ksysfs.c | 8 kernel/sched.c | 1271 +++++++++++++++++++----------------- kernel/sched_debug.c | 198 +++--- kernel/sched_fair.c | 800 +++++++++-------------- kernel/sched_idletask.c | 8 kernel/sched_rt.c | 19 - kernel/sched_stats.h | 28 - kernel/sysctl.c | 31 - kernel/user.c | 249 +++++++ 16 files changed, 1561 insertions(+), 1233 deletions(-) diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt index 84901e7..88bcb87 100644 --- a/Documentation/sched-design-CFS.txt +++ b/Documentation/sched-design-CFS.txt @@ -117,3 +117,70 @@ Some implementation details: iterators of the scheduling modules are used. The balancing code got quite a bit simpler as a result. + +Group scheduler extension to CFS +================================ + +Normally the scheduler operates on individual tasks and strives to provide +fair CPU time to each task. Sometimes, it may be desirable to group tasks +and provide fair CPU time to each such task group. For example, it may +be desirable to first provide fair CPU time to each user on the system +and then to each task belonging to a user. + +CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets +SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such +groups. At present, there are two (mutually exclusive) mechanisms to group +tasks for CPU bandwidth control purpose: + + - Based on user id (CONFIG_FAIR_USER_SCHED) + In this option, tasks are grouped according to their user id. + - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) + This option lets the administrator create arbitrary groups + of tasks, using the "cgroup" pseudo filesystem.
See + Documentation/cgroups.txt for more information about this + filesystem. + +Only one of these options to group tasks can be chosen and not both. + +Group scheduler tunables: + +When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for +each new user and a "cpu_share" file is added in that directory. + + # cd /sys/kernel/uids + # cat 512/cpu_share # Display user 512's CPU share + 1024 + # echo 2048 > 512/cpu_share # Modify user 512's CPU share + # cat 512/cpu_share # Display user 512's CPU share + 2048 + # + +CPU bandwidth between two users is divided in the ratio of their CPU shares. +For ex: if you would like user "root" to get twice the bandwidth of user +"guest", then set the cpu_share for both the users such that "root"'s +cpu_share is twice "guest"'s cpu_share. + + +When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created +for each group created using the pseudo filesystem. See example steps +below to create task groups and modify their CPU share using the "cgroups" +pseudo filesystem. + + # mkdir /dev/cpuctl + # mount -t cgroup -ocpu none /dev/cpuctl + # cd /dev/cpuctl + + # mkdir multimedia # create "multimedia" group of tasks + # mkdir browser # create "browser" group of tasks + + # #Configure the multimedia group to receive twice the CPU bandwidth + # #that of browser group + + # echo 2048 > multimedia/cpu.shares + # echo 1024 > browser/cpu.shares + + # firefox & # Launch firefox and move it to "browser" group + # echo <firefox_pid> > browser/tasks + + # #Launch gmplayer (or your favourite movie player) + # echo <movie_player_pid> > multimedia/tasks diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 97b64d7..c9fc6a8 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -214,6 +214,17 @@ config X86_ES7000 endchoice +config SCHED_NO_NO_OMIT_FRAME_POINTER + bool "Single-depth WCHAN output" + default y + help + Calculate simpler /proc/<PID>/wchan values. If this option + is disabled then wchan values will recurse back to the + caller function.
This provides more accurate wchan values, + at the expense of slightly more scheduling overhead. + + If in doubt, say "Y". + config PARAVIRT bool "Paravirtualization support (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/fs/pipe.c b/fs/pipe.c index 6b3d91a..f1fa2b4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -45,8 +45,7 @@ void pipe_wait(struct pipe_inode_info *p * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(&pipe->wait, &wait, - TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); diff --git a/fs/proc/base.c b/fs/proc/base.c index 19489b0..e5d0953 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct tas return sprintf(buffer, "%llu %llu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, - task->sched_info.pcnt); + task->sched_info.pcount); } #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 313c6b6..c94b566 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -86,6 +86,7 @@ #include #include #include #include +#include #include @@ -135,6 +136,7 @@ extern unsigned long weighted_cpuload(co struct seq_file; struct cfs_rq; +struct task_group; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); @@ -173,8 +175,7 @@ #define TASK_TRACED 8 #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 /* in tsk->state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_DEAD 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -595,8 +596,21 @@ #endif /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + +#ifdef CONFIG_FAIR_USER_SCHED + struct task_group *tg; + struct kset kset; + struct 
subsys_attribute user_attr; + struct work_struct work; +#endif }; +#ifdef CONFIG_FAIR_USER_SCHED +extern int uids_kobject_init(void); +#else +static inline int uids_kobject_init(void) { return 0; } +#endif + extern struct user_struct *find_user(uid_t); extern struct user_struct root_user; @@ -608,13 +622,17 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long pcnt; /* # of times run on this cpu */ + unsigned long pcount; /* # of times run on this cpu */ unsigned long long cpu_time, /* time spent on the cpu */ run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ +#ifdef CONFIG_SCHEDSTATS + /* BKL stats */ + unsigned long bkl_count; +#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -749,7 +767,7 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; + unsigned long lb_count[CPU_MAX_IDLE_TYPES]; unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; @@ -759,17 +777,17 @@ #ifdef CONFIG_SCHEDSTATS unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ - unsigned long alb_cnt; + unsigned long alb_count; unsigned long alb_failed; unsigned long alb_pushed; /* SD_BALANCE_EXEC stats */ - unsigned long sbe_cnt; + unsigned long sbe_count; unsigned long sbe_balanced; unsigned long sbe_pushed; /* SD_BALANCE_FORK stats */ - unsigned long sbf_cnt; + unsigned long sbf_count; unsigned long sbf_balanced; unsigned long sbf_pushed; @@ -853,11 +871,11 @@ struct rq; struct sched_domain; struct sched_class { - struct sched_class *next; + const struct sched_class *next; void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void 
(*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); - void (*yield_task) (struct rq *rq, struct task_struct *p); + void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); @@ -887,31 +905,22 @@ struct load_weight { * 4 se->block_start * 4 se->run_node * 4 se->sleep_start - * 4 se->sleep_start_fair * 6 se->load.weight - * 7 se->delta_fair - * 15 se->wait_runtime */ struct sched_entity { - long wait_runtime; - unsigned long delta_fair_run; - unsigned long delta_fair_sleep; - unsigned long delta_exec; - s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; + int peer_preempt; u64 exec_start; u64 sum_exec_runtime; + u64 vruntime; u64 prev_sum_exec_runtime; - u64 wait_start_fair; - u64 sleep_start_fair; #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; - s64 sum_wait_runtime; u64 sleep_start; u64 sleep_max; @@ -920,9 +929,7 @@ #ifdef CONFIG_SCHEDSTATS u64 block_start; u64 block_max; u64 exec_max; - - unsigned long wait_runtime_overruns; - unsigned long wait_runtime_underruns; + u64 slice_max; #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -951,7 +958,7 @@ #endif int prio, static_prio, normal_prio; struct list_head run_list; - struct sched_class *sched_class; + const struct sched_class *sched_class; struct sched_entity se; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1400,15 +1407,16 @@ #endif extern void sched_idle_next(void); +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_nr_latency; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; -extern unsigned int sysctl_sched_stat_granularity; -extern unsigned int sysctl_sched_runtime_limit; -extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; +#endif + +extern unsigned 
int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); @@ -1842,6 +1850,18 @@ extern int sched_mc_power_savings, sched extern void normalize_rt_tasks(void); +#ifdef CONFIG_FAIR_GROUP_SCHED + +extern struct task_group init_task_group; + +extern struct task_group *sched_create_group(void); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_move_task(struct task_struct *tsk); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern unsigned long sched_group_shares(struct task_group *tg); + +#endif + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/init/Kconfig b/init/Kconfig index d54d0ca..54f31a1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -281,6 +281,27 @@ config CPUSETS Say N if unsure. +config FAIR_GROUP_SCHED + bool "Fair group CPU scheduler" + default y + depends on EXPERIMENTAL + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. + +choice + depends on FAIR_GROUP_SCHED + prompt "Basis for grouping tasks" + default FAIR_USER_SCHED + +config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping + tasks, thus providing equal CPU bandwidth to each user. 
+ +endchoice + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 81e6978..09e9574 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats * No locking available for sched_info (and too expensive to add one) * Mitigate by taking snapshot of values */ - t1 = tsk->sched_info.pcnt; + t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; t3 = tsk->sched_info.cpu_time; diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d0e5c48..6046939 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -116,6 +117,13 @@ static int __init ksysfs_init(void) &notes_attr); } + /* + * Create "/sys/kernel/uids" directory and corresponding root user's + * directory under it. + */ + if (!error) + error = uids_kobject_init(); + return error; } diff --git a/kernel/sched.c b/kernel/sched.c index 6107a0c..d7fcd82 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -95,7 +95,7 @@ #define MAX_USER_PRIO (USER_PRIO(MAX_PR /* * Some helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) #define NICE_0_LOAD SCHED_LOAD_SCALE @@ -104,11 +104,9 @@ #define NICE_0_SHIFT SCHED_LOAD_SHIFT /* * These are the 'tuning knobs' of the scheduler: * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire.
*/ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) #define DEF_TIMESLICE (100 * HZ / 1000) #ifdef CONFIG_SMP @@ -132,24 +130,6 @@ static inline void sg_inc_cpu_power(stru } #endif -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -/* - * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - */ -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio == NICE_TO_PRIO(19)) - return 1; - - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) @@ -170,31 +150,91 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -struct load_stat { - struct load_weight load; - u64 load_update_start, load_update_last; - unsigned long delta_fair, delta_exec, delta_stat; +#ifdef CONFIG_FAIR_GROUP_SCHED + +struct cfs_rq; + +/* task group related information */ +struct task_group { + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + /* spinlock to serialize modification to shares */ + spinlock_t lock; }; +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; + +static struct sched_entity *init_sched_entity_p[NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; + +/* Default task group. + * Every task in system belong to this group at bootup. 
+ */ +struct task_group init_task_group = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, +}; + +#ifdef CONFIG_FAIR_USER_SCHED +# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +#else +# define INIT_TASK_GRP_LOAD NICE_0_LOAD +#endif + +static int init_task_group_load = INIT_TASK_GRP_LOAD; + +/* return group to which a task belongs */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + +#ifdef CONFIG_FAIR_USER_SCHED + tg = p->user->tg; +#else + tg = &init_task_group; +#endif + + return tg; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_cfs_rq(struct task_struct *p) +{ + p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; + p->se.parent = task_group(p)->se[task_cpu(p)]; +} + +#else + +static inline void set_task_cfs_rq(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; unsigned long nr_running; - s64 fair_clock; u64 exec_clock; - s64 wait_runtime; - u64 sleeper_bonus; - unsigned long wait_runtime_overruns, wait_runtime_underruns; + u64 min_vruntime; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; struct rb_node *rb_load_balance_curr; -#ifdef CONFIG_FAIR_GROUP_SCHED /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; + + unsigned long nr_spread_over; + +#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in @@ -205,6 +245,8 @@ #ifdef CONFIG_FAIR_GROUP_SCHED * list is used during load balance. */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? 
*/ + struct task_group *tg; /* group that "owns" this runqueue */ + struct rcu_head rcu; #endif }; @@ -236,7 +278,7 @@ struct rq { #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct load_stat ls; /* capture load from *all* tasks on this cpu */ + struct load_weight load; /* capture load from *all* tasks on this cpu */ unsigned long nr_load_updates; u64 nr_switches; @@ -288,16 +330,19 @@ #ifdef CONFIG_SCHEDSTATS unsigned long yld_exp_empty; unsigned long yld_act_empty; unsigned long yld_both_empty; - unsigned long yld_cnt; + unsigned long yld_count; /* schedule() stats */ unsigned long sched_switch; - unsigned long sched_cnt; + unsigned long sched_count; unsigned long sched_goidle; /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; + unsigned long ttwu_count; unsigned long ttwu_local; + + /* BKL stats */ + unsigned long bkl_count; #endif struct lock_class_key rq_lock_key; }; @@ -382,6 +427,37 @@ #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* + * Debugging: various feature bits + */ +enum { + SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, + SCHED_FEAT_START_DEBIT = 2, + SCHED_FEAT_TREE_AVG = 4, + SCHED_FEAT_APPROX_AVG = 8, + SCHED_FEAT_WAKEUP_PREEMPT = 16, + SCHED_FEAT_PREEMPT_RESTRICT = 32, +}; + +const_debug unsigned int sysctl_sched_features = + SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | + SCHED_FEAT_START_DEBIT *1 | + SCHED_FEAT_TREE_AVG *0 | + SCHED_FEAT_APPROX_AVG *0 | + SCHED_FEAT_WAKEUP_PREEMPT *1 | + SCHED_FEAT_PREEMPT_RESTRICT *1; + +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + +/* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ @@ -399,18 +475,7 @@ unsigned long long cpu_clock(int cpu) return now; } - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Change a 
task's ->cfs_rq if it moves across CPUs */ -static inline void set_task_cfs_rq(struct task_struct *p) -{ - p->se.cfs_rq = &task_rq(p)->cfs; -} -#else -static inline void set_task_cfs_rq(struct task_struct *p) -{ -} -#endif +EXPORT_SYMBOL_GPL(cpu_clock); #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) @@ -496,16 +561,13 @@ #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { - struct rq *rq; - -repeat_lock_task: - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + struct rq *rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock(&rq->lock); - goto repeat_lock_task; } - return rq; } /* @@ -518,18 +580,17 @@ static struct rq *task_rq_lock(struct ta { struct rq *rq; -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { + for (;;) { + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; } - return rq; } -static inline void __task_rq_unlock(struct rq *rq) +static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { spin_unlock(&rq->lock); @@ -544,7 +605,7 @@ static inline void task_rq_unlock(struct /* * this_rq_lock - lock this runqueue and disable interrupts. 
*/ -static inline struct rq *this_rq_lock(void) +static struct rq *this_rq_lock(void) __acquires(rq->lock) { struct rq *rq; @@ -644,19 +705,6 @@ static inline void resched_task(struct t } #endif -static u64 div64_likely32(u64 divident, unsigned long divisor) -{ -#if BITS_PER_LONG == 32 - if (likely(divident <= 0xffffffffULL)) - return (u32)divident / divisor; - do_div(divident, divisor); - - return divident; -#else - return divident / divisor; -#endif -} - #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) #else @@ -698,16 +746,14 @@ calc_delta_fair(unsigned long delta_exec return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); } -static void update_load_add(struct load_weight *lw, unsigned long inc) +static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - lw->inv_weight = 0; } -static void update_load_sub(struct load_weight *lw, unsigned long dec) +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - lw->inv_weight = 0; } /* @@ -783,29 +829,20 @@ static int balance_tasks(struct rq *this int *this_best_prio, struct rq_iterator *iterator); #include "sched_stats.h" -#include "sched_rt.c" -#include "sched_fair.c" #include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif #define sched_class_highest (&rt_sched_class) -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - /* * Update delta_exec, delta_fair fields for rq. * * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while + * total load (rq->load.weight) on the runqueue, while * delta_exec advances at the same rate as wall-clock (provided * cpu is not idle). 
* @@ -813,35 +850,17 @@ static void __update_curr_load(struct rq * runqueue over any given interval. This (smoothened) load is used * during load balance. * - * This function is called /before/ updating rq->ls.load + * This function is called /before/ updating rq->load * and when switching tasks. */ -static void update_curr_load(struct rq *rq) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = rq->clock; - ls->delta_stat += rq->clock - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. - */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); -} - static inline void inc_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); - update_load_add(&rq->ls.load, p->se.load.weight); + update_load_add(&rq->load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); - update_load_sub(&rq->ls.load, p->se.load.weight); + update_load_sub(&rq->load, p->se.load.weight); } static void inc_nr_running(struct task_struct *p, struct rq *rq) @@ -858,8 +877,6 @@ static void dec_nr_running(struct task_s static void set_load_weight(struct task_struct *p) { - p->se.wait_runtime = 0; - if (task_has_rt_policy(p)) { p->se.load.weight = prio_to_weight[0] * 2; p->se.load.inv_weight = prio_to_wmult[0] >> 1; @@ -951,20 +968,6 @@ static void activate_task(struct rq *rq, } /* - * activate_idle_task - move idle task to the _front_ of runqueue. - */ -static inline void activate_idle_task(struct task_struct *p, struct rq *rq) -{ - update_rq_clock(rq); - - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible--; - - enqueue_task(rq, p, 0); - inc_nr_running(p, rq); -} - -/* * deactivate_task - remove a task from the runqueue. 
*/ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) @@ -988,15 +991,15 @@ inline int task_curr(const struct task_s /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->ls.load.weight; + return cpu_rq(cpu)->load.weight; } static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { #ifdef CONFIG_SMP task_thread_info(p)->cpu = cpu; - set_task_cfs_rq(p); #endif + set_task_cfs_rq(p); } #ifdef CONFIG_SMP @@ -1005,15 +1008,11 @@ void set_task_cpu(struct task_struct *p, { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - u64 clock_offset, fair_clock_offset; + struct cfs_rq *old_cfsrq = task_cfs_rq(p), + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); + u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; - - if (p->se.wait_start_fair) - p->se.wait_start_fair -= fair_clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1023,6 +1022,8 @@ #ifdef CONFIG_SCHEDSTATS if (p->se.block_start) p->se.block_start -= clock_offset; #endif + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -1077,69 +1078,71 @@ void wait_task_inactive(struct task_stru int running, on_rq; struct rq *rq; -repeat: - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! 
+ */ + rq = task_rq(p); - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) - cpu_relax(); + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) + cpu_relax(); - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - on_rq = p->se.on_rq; - task_rq_unlock(rq, &flags); + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + on_rq = p->se.on_rq; + task_rq_unlock(rq, &flags); - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - goto repeat; - } + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! 
- * - * So if it wa still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(on_rq)) { - yield(); - goto repeat; - } + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it wa still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + schedule_timeout_uninterruptible(1); + continue; + } - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } } /*** @@ -1173,7 +1176,7 @@ void kick_process(struct task_struct *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu, int type) +static unsigned long source_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1188,7 +1191,7 @@ static inline unsigned long source_load( * Return a high guess at the load of a migration-target cpu weighted * according to the scheduling class and "nice" value. 
*/ -static inline unsigned long target_load(int cpu, int type) +static unsigned long target_load(int cpu, int type) { struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); @@ -1230,7 +1233,7 @@ find_idlest_group(struct sched_domain *s /* Skip over this group if it has no CPUs allowed */ if (!cpus_intersects(group->cpumask, p->cpus_allowed)) - goto nextgroup; + continue; local_group = cpu_isset(this_cpu, group->cpumask); @@ -1258,9 +1261,7 @@ find_idlest_group(struct sched_domain *s min_load = avg_load; idlest = group; } -nextgroup: - group = group->next; - } while (group != sd->groups); + } while (group = group->next, group != sd->groups); if (!idlest || 100*this_load < imbalance*min_load) return NULL; @@ -1451,7 +1452,7 @@ #ifdef CONFIG_SMP new_cpu = cpu; - schedstat_inc(rq, ttwu_cnt); + schedstat_inc(rq, ttwu_count); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); goto out_set_cpu; @@ -1583,28 +1584,20 @@ int fastcall wake_up_state(struct task_s */ static void __sched_fork(struct task_struct *p) { - p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.delta_exec = 0; - p->se.delta_fair_run = 0; - p->se.delta_fair_sleep = 0; - p->se.wait_runtime = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; p->se.block_start = 0; p->se.sleep_max = 0; p->se.block_max = 0; p->se.exec_max = 0; + p->se.slice_max = 0; p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; #endif INIT_LIST_HEAD(&p->run_list); @@ -1635,12 +1628,14 @@ void sched_fork(struct task_struct *p, i #ifdef CONFIG_SMP cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif - __set_task_cpu(p, cpu); + set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; + if (!rt_prio(p->prio)) + p->sched_class = 
&fair_sched_class; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -1657,12 +1652,6 @@ #endif } /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -unsigned int __read_mostly sysctl_sched_child_runs_first = 1; - -/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping @@ -1673,24 +1662,14 @@ void fastcall wake_up_new_task(struct ta { unsigned long flags; struct rq *rq; - int this_cpu; rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); /* parent's CPU */ update_rq_clock(rq); p->prio = effective_prio(p); - if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - - if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || - (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || - !current->se.on_rq) { - + if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { /* @@ -1799,7 +1778,7 @@ prepare_task_switch(struct rq *rq, struc * with the lock held can cause deadlocks; see schedule() for * details.) 
*/ -static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; @@ -1981,42 +1960,10 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; - unsigned long total_load = this_rq->ls.load.weight; - unsigned long this_load = total_load; - struct load_stat *ls = &this_rq->ls; + unsigned long this_load = this_rq->load.weight; int i, scale; this_rq->nr_load_updates++; - if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) - goto do_avg; - - /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq); - - fair_delta64 = ls->delta_fair + 1; - ls->delta_fair = 0; - - exec_delta64 = ls->delta_exec + 1; - ls->delta_exec = 0; - - sample_interval64 = this_rq->clock - ls->load_update_last; - ls->load_update_last = this_rq->clock; - - if ((s64)sample_interval64 < (s64)TICK_NSEC) - sample_interval64 = TICK_NSEC; - - if (exec_delta64 > sample_interval64) - exec_delta64 = sample_interval64; - - idle_delta64 = sample_interval64 - exec_delta64; - - tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); - tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); - - this_load = (unsigned long)tmp64; - -do_avg: /* Update our load: */ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2026,7 +1973,13 @@ do_avg: old_load = this_rq->cpu_load[i]; new_load = this_load; - + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (new_load > old_load) + new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } } @@ -2263,7 +2216,7 @@ static int move_tasks(struct rq *this_rq struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { - struct sched_class *class = sched_class_highest; + const struct sched_class *class = sched_class_highest; unsigned long total_load_moved = 0; int this_best_prio = this_rq->curr->prio; @@ -2288,7 +2241,7 @@ static int move_tasks(struct rq *this_rq static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle) { - struct sched_class *class; + const struct sched_class *class; int this_best_prio = MAX_PRIO; for (class = sched_class_highest; class; class = class->next) @@ -2652,7 +2605,7 @@ static int load_balance(int this_cpu, st !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[idle]); + schedstat_inc(sd, lb_count[idle]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, @@ -2805,7 +2758,7 @@ load_balance_newidle(int this_cpu, struc !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, &cpus, NULL); @@ -2939,7 +2892,7 @@ static void active_load_balance(struct r } if (likely(sd)) { - schedstat_inc(sd, alb_cnt); + schedstat_inc(sd, alb_count); if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE)) @@ -3032,7 +2985,7 @@ static DEFINE_SPINLOCK(balancing); * * Balancing parameters are set up in arch_init_sched_domains. 
*/ -static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(int cpu, enum cpu_idle_type idle) { int balance = 1; struct rq *rq = cpu_rq(cpu); @@ -3429,7 +3382,13 @@ static inline void schedule_debug(struct profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_cnt); + schedstat_inc(this_rq(), sched_count); +#ifdef CONFIG_SCHEDSTATS + if (unlikely(prev->lock_depth >= 0)) { + schedstat_inc(this_rq(), bkl_count); + schedstat_inc(prev, sched_info.bkl_count); + } +#endif } /* @@ -3438,7 +3397,7 @@ static inline void schedule_debug(struct static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev) { - struct sched_class *class; + const struct sched_class *class; struct task_struct *p; /* @@ -3487,9 +3446,13 @@ need_resched_nonpreemptible: schedule_debug(prev); - spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); + /* + * Do the rq-clock update outside the rq lock: + */ + local_irq_disable(); __update_rq_clock(rq); + spin_lock(&rq->lock); + clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3549,27 +3512,30 @@ #endif if (likely(ti->preempt_count || irqs_disabled())) return; -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - schedule(); + schedule(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - 
sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } EXPORT_SYMBOL(preempt_schedule); @@ -3589,29 +3555,32 @@ #endif /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); -need_resched: - add_preempt_count(PREEMPT_ACTIVE); - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ + do { + add_preempt_count(PREEMPT_ACTIVE); + + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ #ifdef CONFIG_PREEMPT_BKL - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; #endif - local_irq_enable(); - schedule(); - local_irq_disable(); + local_irq_enable(); + schedule(); + local_irq_disable(); #ifdef CONFIG_PREEMPT_BKL - task->lock_depth = saved_lock_depth; + task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count(PREEMPT_ACTIVE); - /* we could miss a preemption opportunity between schedule and now */ - barrier(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - goto need_resched; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. 
+ */ + barrier(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); } #endif /* CONFIG_PREEMPT */ @@ -3635,10 +3604,9 @@ EXPORT_SYMBOL(default_wake_function); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { - struct list_head *tmp, *next; + wait_queue_t *curr, *next; - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, sync, key) && @@ -3728,206 +3696,116 @@ void fastcall complete_all(struct comple } EXPORT_SYMBOL(complete_all); -void fastcall __sched wait_for_completion(struct completion *x) +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) { - might_sleep(); - - spin_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; - spin_unlock_irq(&x->wait.lock); -} -EXPORT_SYMBOL(wait_for_completion); - -unsigned long fastcall __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - __set_current_state(TASK_UNINTERRUPTIBLE); + if (state == TASK_INTERRUPTIBLE && + signal_pending(current)) { + __remove_wait_queue(&x->wait, &wait); + return -ERESTARTSYS; + } + __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); if (!timeout) { __remove_wait_queue(&x->wait, &wait); - goto out; + return 
timeout; } } while (!x->done); __remove_wait_queue(&x->wait, &wait); } x->done--; -out: - spin_unlock_irq(&x->wait.lock); return timeout; } -EXPORT_SYMBOL(wait_for_completion_timeout); -int fastcall __sched wait_for_completion_interruptible(struct completion *x) +static long __sched +wait_for_common(struct completion *x, long timeout, int state) { - int ret = 0; - might_sleep(); spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - ret = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - schedule(); - spin_lock_irq(&x->wait.lock); - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); - } - x->done--; -out: + timeout = do_wait_for_common(x, timeout, state); spin_unlock_irq(&x->wait.lock); + return timeout; +} - return ret; +void fastcall __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } -EXPORT_SYMBOL(wait_for_completion_interruptible); +EXPORT_SYMBOL(wait_for_completion); unsigned long fastcall __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) +wait_for_completion_timeout(struct completion *x, unsigned long timeout) { - might_sleep(); - - spin_lock_irq(&x->wait.lock); - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - wait.flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(&x->wait, &wait); - do { - if (signal_pending(current)) { - timeout = -ERESTARTSYS; - __remove_wait_queue(&x->wait, &wait); - goto out; - } - __set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - goto out; - } - } while (!x->done); - __remove_wait_queue(&x->wait, &wait); 
- } - x->done--; -out: - spin_unlock_irq(&x->wait.lock); - return timeout; + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); } -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +EXPORT_SYMBOL(wait_for_completion_timeout); -static inline void -sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +int __sched wait_for_completion_interruptible(struct completion *x) { - spin_lock_irqsave(&q->lock, *flags); - __add_wait_queue(q, wait); - spin_unlock(&q->lock); + return wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); } +EXPORT_SYMBOL(wait_for_completion_interruptible); -static inline void -sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) +unsigned long fastcall __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) { - spin_lock_irq(&q->lock); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, *flags); + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); } +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); -void __sched interruptible_sleep_on(wait_queue_head_t *q) +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) { unsigned long flags; wait_queue_t wait; init_waitqueue_entry(&wait, current); - current->state = TASK_INTERRUPTIBLE; + __set_current_state(state); - sleep_on_head(q, &wait, &flags); - schedule(); - sleep_on_tail(q, &wait, &flags); + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, &wait); + spin_unlock(&q->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&q->lock); + __remove_wait_queue(q, &wait); + spin_unlock_irqrestore(&q->lock, flags); + + return timeout; +} + +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(interruptible_sleep_on); long __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t 
wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_INTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); - - return timeout; + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); } EXPORT_SYMBOL(interruptible_sleep_on_timeout); void __sched sleep_on(wait_queue_head_t *q) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_UNINTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - schedule(); - sleep_on_tail(q, &wait, &flags); + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } EXPORT_SYMBOL(sleep_on); long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - current->state = TASK_UNINTERRUPTIBLE; - - sleep_on_head(q, &wait, &flags); - timeout = schedule_timeout(timeout); - sleep_on_tail(q, &wait, &flags); - - return timeout; + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); } EXPORT_SYMBOL(sleep_on_timeout); @@ -3946,7 +3824,7 @@ #ifdef CONFIG_RT_MUTEXES void rt_mutex_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq; + int oldprio, on_rq, running; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3956,8 +3834,12 @@ void rt_mutex_setprio(struct task_struct oldprio = p->prio; on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3967,13 +3849,15 @@ void rt_mutex_setprio(struct task_struct p->prio = prio; if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ 
- if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4137,7 +4021,7 @@ struct task_struct *idle_task(int cpu) * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ -static inline struct task_struct *find_process_by_pid(pid_t pid) +static struct task_struct *find_process_by_pid(pid_t pid) { return pid ? find_task_by_pid(pid) : current; } @@ -4179,7 +4063,7 @@ __setscheduler(struct rq *rq, struct tas int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; struct rq *rq; @@ -4261,18 +4145,26 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4343,10 +4235,10 @@ asmlinkage long sys_sched_setparam(pid_t asmlinkage long sys_sched_getscheduler(pid_t pid) { struct task_struct *p; - int retval = -EINVAL; + int retval; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4357,8 +4249,6 @@ asmlinkage long sys_sched_getscheduler(p retval = p->policy; } read_unlock(&tasklist_lock); - -out_nounlock: return retval; } @@ -4371,10 +4261,10 @@ asmlinkage long sys_sched_getparam(pid_t { struct sched_param lp; struct task_struct *p; - int retval = -EINVAL; + int retval; if (!param || 
pid < 0) - goto out_nounlock; + return -EINVAL; read_lock(&tasklist_lock); p = find_process_by_pid(pid); @@ -4394,7 +4284,6 @@ asmlinkage long sys_sched_getparam(pid_t */ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -out_nounlock: return retval; out_unlock: @@ -4554,8 +4443,8 @@ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_cnt); - current->sched_class->yield_task(rq, current); + schedstat_inc(rq, yld_count); + current->sched_class->yield_task(rq); /* * Since we are going to call schedule() anyway, there's @@ -4749,11 +4638,12 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { struct task_struct *p; - int retval = -EINVAL; + unsigned int time_slice; + int retval; struct timespec t; if (pid < 0) - goto out_nounlock; + return -EINVAL; retval = -ESRCH; read_lock(&tasklist_lock); @@ -4765,12 +4655,24 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; - jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : static_prio_timeslice(p->static_prio), &t); + if (p->policy == SCHED_FIFO) + time_slice = 0; + else if (p->policy == SCHED_RR) + time_slice = DEF_TIMESLICE; + else { + struct sched_entity *se = &p->se; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + task_rq_unlock(rq, &flags); + } read_unlock(&tasklist_lock); + jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; -out_nounlock: return retval; + out_unlock: read_unlock(&tasklist_lock); return retval; @@ -4899,32 +4801,6 @@ #endif */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. 
- * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 100000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_runtime_limit = sysctl_sched_latency; - sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; -} - #ifdef CONFIG_SMP /* * This is how migration works: @@ -5102,35 +4978,34 @@ static void move_task_off_dead_cpu(int d struct rq *rq; int dest_cpu; -restart: - /* On same node? */ - mask = node_to_cpumask(cpu_to_node(dead_cpu)); - cpus_and(mask, mask, p->cpus_allowed); - dest_cpu = any_online_cpu(mask); - - /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) - dest_cpu = any_online_cpu(p->cpus_allowed); - - /* No more Mr. Nice Guy. */ - if (dest_cpu == NR_CPUS) { - rq = task_rq_lock(p, &flags); - cpus_setall(p->cpus_allowed); - dest_cpu = any_online_cpu(p->cpus_allowed); - task_rq_unlock(rq, &flags); + do { + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, p->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(p->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(p, &flags); + cpus_setall(p->cpus_allowed); + dest_cpu = any_online_cpu(p->cpus_allowed); + task_rq_unlock(rq, &flags); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. 
- */ - if (p->mm && printk_ratelimit()) - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - p->pid, p->comm, dead_cpu); - } - if (!__migrate_task(p, dead_cpu, dest_cpu)) - goto restart; + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + p->pid, p->comm, dead_cpu); + } + } while (!__migrate_task(p, dead_cpu, dest_cpu)); } /* @@ -5172,6 +5047,20 @@ static void migrate_live_tasks(int src_c } /* + * activate_idle_task - move idle task to the _front_ of runqueue. + */ +static void activate_idle_task(struct task_struct *p, struct rq *rq) +{ + update_rq_clock(rq); + + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, 0); + inc_nr_running(p, rq); +} + +/* * Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of the runqueue. Used by CPU offline code. 
@@ -5306,7 +5195,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(12); set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); @@ -5326,10 +5215,10 @@ sd_alloc_ctl_domain_table(struct sched_d sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[10], "cache_nice_tries", + set_table_entry(&table[9], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[12], "flags", &sd->flags, + set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); return table; @@ -5498,8 +5387,7 @@ #ifdef CONFIG_SMP int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); -#undef SCHED_DOMAIN_DEBUG -#ifdef SCHED_DOMAIN_DEBUG +#ifdef CONFIG_SCHED_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; @@ -5557,16 +5445,19 @@ static void sched_domain_debug(struct sc printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); + break; } if (!cpus_weight(group->cpumask)) { printk("\n"); printk(KERN_ERR "ERROR: empty group\n"); + break; } if (cpus_intersects(groupmask, group->cpumask)) { printk("\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; } cpus_or(groupmask, groupmask, group->cpumask); @@ -5700,7 +5591,7 @@ static int __init isolated_cpu_setup(cha return 1; } -__setup ("isolcpus=", isolated_cpu_setup); +__setup("isolcpus=", isolated_cpu_setup); /* * init_sched_build_groups takes the cpumask we wish to span, and a pointer @@ -5929,24 +5820,23 @@ static void init_numa_sched_groups_power if (!sg) return; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; + do { + for_each_cpu_mask(j, 
sg->cpumask) { + struct sched_domain *sd; - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); - } - sg = sg->next; - if (sg != group_head) - goto next_sg; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); + } + sg = sg->next; + } while (sg != group_head); } #endif @@ -6492,12 +6382,10 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); - sched_init_granularity(); } #else void __init sched_init_smp(void) { - sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -6511,28 +6399,20 @@ int in_sched_functions(unsigned long add && addr < (unsigned long)__sched_text_end); } -static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; - cfs_rq->fair_clock = 1; #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } void __init sched_init(void) { - u64 now = sched_clock(); int highest_cpu = 0; int i, j; - /* - * Link up the scheduling class hierarchy: - */ - rt_sched_class.next = &fair_sched_class; - fair_sched_class.next = &idle_sched_class; - idle_sched_class.next = NULL; - for_each_possible_cpu(i) { struct rt_prio_array *array; struct rq *rq; @@ -6545,10 +6425,28 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, 
rq); + cfs_rq->tg = &init_task_group; + list_add(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = init_task_group_load; + se->load.inv_weight = + div64_64(1ULL<<32, init_task_group_load); + se->parent = NULL; + } + init_task_group.shares = init_task_group_load; + spin_lock_init(&init_task_group.lock); #endif - rq->ls.load_update_last = now; - rq->ls.load_update_start = now; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -6633,26 +6531,40 @@ EXPORT_SYMBOL(__might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + int on_rq; + update_rq_clock(rq); + on_rq = p->se.on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); + resched_task(rq->curr); + } +} + void normalize_rt_tasks(void) { struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int on_rq; read_lock_irq(&tasklist_lock); do_each_thread(g, p) { - p->se.fair_key = 0; - p->se.wait_runtime = 0; + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + p->se.exec_start = 0; - p->se.wait_start_fair = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; if (!rt_task(p)) { @@ -6667,26 +6579,9 @@ #endif spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); -#ifdef CONFIG_SMP - /* - * Do not touch the migration thread: - */ - if (p == rq->migration_thread) - goto out_unlock; -#endif - update_rq_clock(rq); - on_rq = p->se.on_rq; - if (on_rq) - deactivate_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); - if (on_rq) { - activate_task(rq, p, 0); - resched_task(rq->curr); - } -#ifdef CONFIG_SMP - out_unlock: -#endif + normalize_task(rq, p); + __task_rq_unlock(rq); 
spin_unlock_irqrestore(&p->pi_lock, flags); } while_each_thread(g, p); @@ -6739,3 +6634,201 @@ void set_curr_task(int cpu, struct task_ } #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(void) +{ + struct task_group *tg; + struct cfs_rq *cfs_rq; + struct sched_entity *se; + struct rq *rq; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + if (!tg->se) + goto err; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, + cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, + cpu_to_node(i)); + if (!se) + goto err; + + memset(cfs_rq, 0, sizeof(struct cfs_rq)); + memset(se, 0, sizeof(struct sched_entity)); + + tg->cfs_rq[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + + tg->se[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } + + tg->shares = NICE_0_LOAD; + spin_lock_init(&tg->lock); + + return tg; + +err: + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); + + return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group(struct rcu_head *rhp) +{ + struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); + struct task_group *tg = cfs_rq->tg; + struct sched_entity *se; + int i; + + /* now it should be safe to free those cfs_rqs */ + 
for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + kfree(cfs_rq); + + se = tg->se[i]; + kfree(se); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ + struct cfs_rq *cfs_rq; + int i; + + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + cfs_rq = tg->cfs_rq[0]; + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&cfs_rq->rcu, free_sched_group); +} + +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + if (tsk->sched_class != &fair_sched_class) + goto done; + + update_rq_clock(rq); + + running = task_running(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) { + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + } + + set_task_cfs_rq(tsk); + + if (on_rq) { + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + enqueue_task(rq, tsk, 0); + } + +done: + task_rq_unlock(rq, &flags); +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + int on_rq; + + spin_lock_irq(&rq->lock); + + on_rq = se->on_rq; + if (on_rq) + dequeue_entity(cfs_rq, se, 0); + + se->load.weight = shares; + se->load.inv_weight = div64_64((1ULL<<32), shares); + + if (on_rq) + enqueue_entity(cfs_rq, se, 0); + + spin_unlock_irq(&rq->lock); +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + + spin_lock(&tg->lock); + if (tg->shares == shares) + goto done; + + tg->shares = shares; + 
for_each_possible_cpu(i) + set_se_shares(tg->se[i], shares); + +done: + spin_unlock(&tg->lock); + return 0; +} + +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index c3ee38b..0aab455 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -28,6 +28,31 @@ #define SEQ_printf(m, x...) \ printk(x); \ } while (0) +/* + * Ease the printing of nsec fields: + */ +static long long nsec_high(long long nsec) +{ + if (nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static unsigned long nsec_low(long long nsec) +{ + if (nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, p->pid, - (long long)p->se.fair_key, - (long long)(p->se.fair_key - rq->cfs.fair_clock), - (long long)p->se.wait_runtime, + SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_wait_runtime, - (long long)p->se.sum_sleep_runtime, - (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - 0LL, 0LL, 0LL, 0LL, 0LL); + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif } @@ -62,14 +83,10 @@ static void 
print_rq(struct seq_file *m, SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key delta waiting" - " switches prio" - " sum-exec sum-wait sum-sleep" - " wait-overrun wait-underrun\n" - "------------------------------------------------------------------" - "----------------" - "------------------------------------------------" - "--------------------------------\n"); + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" + "------------------------------------------------------" + "----------------------------------------------------\n"); read_lock_irq(&tasklist_lock); @@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, read_unlock_irq(&tasklist_lock); } -static void -print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 wait_runtime_rq_sum = 0; - struct task_struct *p; - struct rb_node *curr; - unsigned long flags; + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; struct rq *rq = &per_cpu(runqueues, cpu); + struct sched_entity *last; + unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); - curr = first_fair(cfs_rq); - while (curr) { - p = rb_entry(curr, struct task_struct, se.run_node); - wait_runtime_rq_sum += p->se.wait_runtime; - - curr = rb_next(curr); - } - spin_unlock_irqrestore(&rq->lock, flags); - - SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", - (long long)wait_runtime_rq_sum); -} - -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ SEQ_printf(m, "\ncfs_rq\n"); -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - - P(fair_clock); - P(exec_clock); - P(wait_runtime); - P(wait_runtime_overruns); - P(wait_runtime_underruns); - P(sleeper_bonus); -#undef P + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); - print_cfs_rq_runtime_sum(m, cpu, cfs_rq); + spin_lock_irqsave(&rq->lock, 
flags); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + min_vruntime = rq->cfs.min_vruntime; + rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", + rq->bkl_count); +#endif + SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + cfs_rq->nr_spread_over); } static void print_cpu(struct seq_file *m, int cpu) @@ -141,31 +161,32 @@ #endif #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->ls.load.weight); - P(ls.delta_fair); - P(ls.delta_exec); + rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - P(next_balance); + PN(next_balance); P(curr->pid); - P(clock); - P(idle_clock); - P(prev_clock_raw); + PN(clock); + PN(idle_clock); + PN(prev_clock_raw); P(clock_warps); P(clock_overflows); P(clock_deep_idle_events); - P(clock_max_delta); + PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); #undef P +#undef PN print_cfs_stats(m, cpu); @@ -182,7 +203,20 @@ 
static int sched_debug_show(struct seq_f (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); + SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); + +#define P(x) \ + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_nr_latency); + PN(sysctl_sched_wakeup_granularity); + PN(sysctl_sched_batch_wakeup_granularity); + PN(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -240,24 +274,23 @@ void proc_sched_show_task(struct task_st SEQ_printf(m, "----------------------------------------------\n"); #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) +#define PN(F) \ + SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - P(se.wait_runtime); - P(se.wait_start_fair); - P(se.exec_start); - P(se.sleep_start_fair); - P(se.sum_exec_runtime); + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - P(se.wait_start); - P(se.sleep_start); - P(se.block_start); - P(se.sleep_max); - P(se.block_max); - P(se.exec_max); - P(se.wait_max); - P(se.wait_runtime_overruns); - P(se.wait_runtime_underruns); - P(se.sum_wait_runtime); + PN(se.wait_start); + PN(se.sleep_start); + PN(se.block_start); + PN(se.sleep_max); + PN(se.block_max); + PN(se.exec_max); + PN(se.slice_max); + PN(se.wait_max); + P(sched_info.bkl_count); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -265,6 +298,7 @@ #endif P(policy); P(prio); #undef P +#undef PN { u64 t0, t1; @@ -279,9 +313,13 @@ #undef P void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; - p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; + p->se.sleep_max = 
0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.wait_max = 0; + p->sched_info.bkl_count = 0; #endif - p->se.sum_exec_runtime = 0; + p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 67c67a8..c240b72 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,22 +25,26 @@ * (default: 20ms, units: nanoseconds) * * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length. - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches field) + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. * - * On SMP systems the value of this is multiplied by the log2 of the - * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way - * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) - * Targeted preemption latency for CPU-bound tasks: + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; +const_debug unsigned int sysctl_sched_latency = 20000000ULL; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +const_debug unsigned int sysctl_sched_nr_latency = 20; /* * sys_sched_yield() compat mode @@ -52,52 +56,23 @@ unsigned int __read_mostly sysctl_sched_ /* * SCHED_BATCH wake-up granularity. 
- * (default: 25 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; +const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 1 msec, units: nanoseconds) + * (default: 10 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; - -unsigned int sysctl_sched_stat_granularity __read_mostly; - -/* - * Initialized in sched_init_granularity() [to 5 times the base granularity]: - */ -unsigned int sysctl_sched_runtime_limit __read_mostly; - -/* - * Debugging: various feature bits - */ -enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_SLEEPER_AVG = 2, - SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_PRECISE_CPU_LOAD = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, -}; - -unsigned int sysctl_sched_features __read_mostly = - SCHED_FEAT_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *0 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_PRECISE_CPU_LOAD *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; - -extern struct sched_class fair_sched_class; +const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; /************************************************************** * CFS operations on generic schedulable entities: @@ -111,21 +86,9 @@ static inline struct rq *rq_of(struct cf return cfs_rq->rq; } -/* currently running entity (if any) on this cfs_rq */ -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - return cfs_rq->curr; -} - /* An entity 
is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->curr = se; -} - #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -133,21 +96,8 @@ static inline struct rq *rq_of(struct cf return container_of(cfs_rq, struct rq, cfs); } -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - - if (unlikely(rq->curr->sched_class != &fair_sched_class)) - return NULL; - - return &rq->curr->se; -} - #define entity_is_task(se) 1 -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline struct task_struct *task_of(struct sched_entity *se) @@ -160,16 +110,38 @@ static inline struct task_struct *task_o * Scheduling class tree data structure manipulation methods: */ +static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->vruntime - cfs_rq->min_vruntime; +} + /* * Enqueue an entity into the rb-tree: */ -static inline void -__enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = se->fair_key; + s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -182,7 +154,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, * We dont care about collisions. 
Nodes with * the same key stay together. */ - if (key - entry->fair_key < 0) { + if (key < entity_key(cfs_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; @@ -199,24 +171,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static inline void -__dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -229,118 +191,86 @@ static struct sched_entity *__pick_next_ return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); } +static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *se = NULL; + struct rb_node *parent; + + while (*link) { + parent = *link; + se = rb_entry(parent, struct sched_entity, run_node); + link = &parent->rb_right; + } + + return se; +} + /************************************************************** * Scheduling class statistics methods: */ + /* - * Calculate the preemption granularity needed to schedule every - * runnable task once per sysctl_sched_latency amount of time. - * (down to a sensible low limit on granularity) - * - * For example, if there are 2 tasks running and latency is 10 msecs, - * we switch tasks every 5 msecs. 
If we have 3 tasks running, we have - * to switch tasks every 3.33 msecs to get a 10 msecs observed latency - * for each task. We do finer and finer scheduling up to until we - * reach the minimum granularity value. - * - * To achieve this we use the following dynamic-granularity rule: - * - * gran = lat/nr - lat/nr/nr + * The idea is to set a period in which each task runs once. * - * This comes out of the following equations: + * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. * - * kA1 + gran = kB1 - * kB2 + gran = kA2 - * kA2 = kA1 - * kB2 = kB1 - d + d/nr - * lat = d * nr - * - * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), - * '1' is start of time, '2' is end of time, 'd' is delay between - * 1 and 2 (during which task B was running), 'nr' is number of tasks - * running, 'lat' is the the period of each task. ('lat' is the - * sched_latency that we aim for.) + * p = (nr <= nl) ? l : l*nr/nl */ -static long -sched_granularity(struct cfs_rq *cfs_rq) +static u64 __sched_period(unsigned long nr_running) { - unsigned int gran = sysctl_sched_latency; - unsigned int nr = cfs_rq->nr_running; + u64 period = sysctl_sched_latency; + unsigned long nr_latency = sysctl_sched_nr_latency; - if (nr > 1) { - gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_min_granularity); + if (unlikely(nr_running > nr_latency)) { + period *= nr_running; + do_div(period, nr_latency); } - return gran; + return period; } /* - * We rescale the rescheduling granularity of tasks according to their - * nice level, but only linearly, not exponentially: + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. 
+ * + * s = p*w/rw */ -static long -niced_granularity(struct sched_entity *curr, unsigned long granularity) +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 tmp; + u64 slice = __sched_period(cfs_rq->nr_running); - if (likely(curr->load.weight == NICE_0_LOAD)) - return granularity; - /* - * Positive nice levels get the same granularity as nice-0: - */ - if (likely(curr->load.weight < NICE_0_LOAD)) { - tmp = curr->load.weight * (u64)granularity; - return (long) (tmp >> NICE_0_SHIFT); - } - /* - * Negative nice level tasks get linearly finer - * granularity: - */ - tmp = curr->load.inv_weight * (u64)granularity; + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); - /* - * It will always fit into 'long': - */ - return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); + return slice; } -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) +/* + * We calculate the vruntime slice. + * + * vs = s/w = p/rw + */ +static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) { - long limit = sysctl_sched_runtime_limit; + u64 vslice = __sched_period(nr_running); - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if (unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } + do_div(vslice, rq_weight); + + return vslice; } -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) +static u64 sched_vslice(struct cfs_rq *cfs_rq) { - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); + return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); } -static void -add_wait_runtime(struct 
cfs_rq *cfs_rq, struct sched_entity *se, long delta) +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + return __sched_vslice(cfs_rq->load.weight + se->load.weight, + cfs_rq->nr_running + 1); } /* @@ -348,46 +278,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, * are not in our scheduling class. */ static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) { - unsigned long delta, delta_exec, delta_fair, delta_mine; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; + unsigned long delta_exec_weighted; + u64 vruntime; - delta_exec = curr->delta_exec; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; - - if (unlikely(!load)) - return; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { - delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); - delta = min(delta, (unsigned long)( - (long)sysctl_sched_runtime_limit - curr->wait_runtime)); - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; + schedstat_add(cfs_rq, exec_clock, delta_exec); + delta_exec_weighted = delta_exec; + if (unlikely(curr->load.weight != NICE_0_LOAD)) { + delta_exec_weighted = calc_delta_fair(delta_exec_weighted, + &curr->load); } + curr->vruntime += delta_exec_weighted; - cfs_rq->fair_clock += delta_fair; /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: + * 
maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(curr->vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = curr->vruntime; + + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) { - struct sched_entity *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; if (unlikely(!curr)) @@ -398,135 +323,47 @@ static void update_curr(struct cfs_rq *c * since the last time we changed load (this cannot * overflow on 32 bits): */ - delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); - - curr->delta_exec += delta_exec; + delta_exec = (unsigned long)(now - curr->exec_start); - if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr); - curr->delta_exec = 0; - } - curr->exec_start = rq_of(cfs_rq)->clock; + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; } static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->wait_start_fair = cfs_rq->fair_clock; schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } /* - * We calculate fair deltas here, so protect against the random effects - * of a multiplication overflow by capping it to the runtime limit: - */ -#if BITS_PER_LONG == 32 -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - u64 tmp = (u64)delta * weight >> shift; - - if (unlikely(tmp > sysctl_sched_runtime_limit*2)) - return sysctl_sched_runtime_limit*2; - return tmp; -} -#else -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - return delta * weight >> shift; -} -#endif - -/* * Task is being enqueued - update stats: */ 
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 key; - /* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); - /* - * Update the key: - */ - key = cfs_rq->fair_clock; - - /* - * Optimize the common nice 0 case: - */ - if (likely(se->load.weight == NICE_0_LOAD)) { - key -= se->wait_runtime; - } else { - u64 tmp; - - if (se->wait_runtime < 0) { - tmp = -se->wait_runtime; - key += (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } else { - tmp = se->wait_runtime; - key -= (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } - } - - se->fair_key = key; -} - -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - unsigned long delta_fair = se->delta_fair_run; - - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - add_wait_runtime(cfs_rq, se, delta_fair); } static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long delta_fair; - - if (unlikely(!se->wait_start_fair)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - se->delta_fair_run += delta_fair; - if (unlikely(abs(se->delta_fair_run) >= - sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se); - se->delta_fair_run = 0; - } - - se->wait_start_fair = 0; + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); schedstat_set(se->wait_start, 0); } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_curr(cfs_rq); /* * 
Mark the end of the wait period if dequeueing a * waiting task: */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); } @@ -555,66 +392,24 @@ update_stats_curr_end(struct cfs_rq *cfs * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long load = cfs_rq->load.weight, delta_fair; - long prev_runtime; - - /* - * Do not boost sleepers if there's too much bonus 'in flight' - * already: - */ - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - return; - - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) - load = rq_of(cfs_rq)->cpu_load[2]; - - delta_fair = se->delta_fair_sleep; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep period: - */ - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} - /* - * Track the amount of bonus we've given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned 
long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - se->delta_fair_sleep += delta_fair; - if (unlikely(abs(se->delta_fair_sleep) >= - sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se); - se->delta_fair_sleep = 0; - } - - se->sleep_start_fair = 0; - #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; @@ -646,6 +441,8 @@ #ifdef CONFIG_SCHEDSTATS * time that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { + struct task_struct *tsk = task_of(se); + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } @@ -653,27 +450,81 @@ #ifdef CONFIG_SCHEDSTATS #endif } +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +{ + u64 vruntime; + + vruntime = cfs_rq->min_vruntime; + + if (sched_feat(TREE_AVG)) { + struct sched_entity *last = __pick_last_entity(cfs_rq); + if (last) { + vruntime += last->vruntime; + vruntime >>= 1; + } + } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) + vruntime += sched_vslice(cfs_rq)/2; + + if (initial && sched_feat(START_DEBIT)) + vruntime += sched_vslice_add(cfs_rq, se); + + if (!initial) { + if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && + task_of(se)->policy != SCHED_BATCH) + vruntime -= sysctl_sched_latency; + + vruntime = max_t(s64, vruntime, se->vruntime); + } + + se->vruntime = vruntime; + +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* - * Update the fair clock. + * Update run-time statistics of the 'current'. 
*/ update_curr(cfs_rq); - if (wakeup) + if (wakeup) { + place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); + } update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + check_spread(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); } static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + update_stats_dequeue(cfs_rq, se); if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; + se->peer_preempt = 0; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -685,68 +536,66 @@ #ifdef CONFIG_SCHEDSTATS } #endif } - __dequeue_entity(cfs_rq, se); + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); } /* * Preempt the current task with a newly woken task if needed: */ static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr, unsigned long granularity) +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - s64 __delta = curr->fair_key - se->fair_key; unsigned long ideal_runtime, delta_exec; - /* - * ideal_runtime is compared against sum_exec_runtime, which is - * walltime, hence do not scale. - */ - ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, - (unsigned long)sysctl_sched_min_granularity); - - /* - * If we executed more than what the latency constraint suggests, - * reduce the rescheduling granularity. 
This way the total latency - * of how much a task is not scheduled converges to - * sysctl_sched_latency: - */ + ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) - granularity = 0; - - /* - * Take scheduling granularity into account - do not - * preempt the current task unless the best task has - * a larger than sched_granularity fairness advantage: - * - * scale granularity as key space is in fair_clock. - */ - if (__delta > niced_granularity(curr, granularity)) + if (delta_exec > ideal_runtime || + (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) resched_task(rq_of(cfs_rq)->curr); + curr->peer_preempt = 0; } -static inline void +static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + } + + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; +#ifdef CONFIG_SCHEDSTATS /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. (note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. 
dont track it + * when there are only lesser-weight tasks around): */ - update_stats_wait_end(cfs_rq, se); - update_stats_curr_start(cfs_rq, se); - set_cfs_rq_curr(cfs_rq, se); + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + se->slice_max = max(se->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = NULL; - set_next_entity(cfs_rq, se); + if (first_fair(cfs_rq)) { + se = __pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + } return se; } @@ -762,31 +611,24 @@ static void put_prev_entity(struct cfs_r update_stats_curr_end(cfs_rq, prev); - if (prev->on_rq) + check_spread(cfs_rq, prev); + if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); - set_cfs_rq_curr(cfs_rq, NULL); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } + cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *next; - /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ - dequeue_entity(cfs_rq, curr, 0); - enqueue_entity(cfs_rq, curr, 0); - - /* - * Reschedule if another task tops the current one. 
- */ - next = __pick_next_entity(cfs_rq); - if (next == curr) - return; + update_curr(cfs_rq); - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) + check_preempt_tick(cfs_rq, curr); } /************************************************** @@ -821,23 +663,28 @@ static inline struct cfs_rq *group_cfs_r */ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) { - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; + return cfs_rq->tg->cfs_rq[this_cpu]; } /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) -/* Do the two (enqueued) tasks belong to the same group ? */ -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { - if (curr->se.cfs_rq == p->se.cfs_rq) + if (se->cfs_rq == pse->cfs_rq) return 1; return 0; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -870,11 +717,17 @@ static inline struct cfs_rq *cpu_cfs_rq( #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { return 1; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -892,6 +745,7 @@ static void enqueue_task_fair(struct rq break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); + wakeup = 1; } } @@ -911,6 +765,7 @@ static void dequeue_task_fair(struct rq 
/* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; + sleep = 1; } } @@ -919,12 +774,10 @@ static void dequeue_task_fair(struct rq * * If compat_yield is turned on then we requeue to the end of the tree. */ -static void yield_task_fair(struct rq *rq, struct task_struct *p) +static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *rightmost, *se = &p->se; - struct rb_node *parent; + struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); + struct sched_entity *rightmost, *se = &rq->curr->se; /* * Are we the only task in the tree? @@ -935,52 +788,39 @@ static void yield_task_fair(struct rq *r if (likely(!sysctl_sched_compat_yield)) { __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Update run-time statistics of the 'current'. */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + update_curr(cfs_rq); return; } /* * Find the rightmost entry in the rbtree: */ - do { - parent = *link; - link = &parent->rb_right; - } while (*link); - - rightmost = rb_entry(parent, struct sched_entity, run_node); + rightmost = __pick_last_entity(cfs_rq); /* * Already in the rightmost position? */ - if (unlikely(rightmost == se)) + if (unlikely(rightmost->vruntime < se->vruntime)) return; /* * Minimally necessary key value to be last in the tree: + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. 
*/ - se->fair_key = rightmost->fair_key + 1; - - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); - /* - * Relink the task to the rightmost position: - */ - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + se->vruntime = rightmost->vruntime + 1; } /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - unsigned long gran; + struct sched_entity *se = &curr->se, *pse = &p->se; + s64 delta, gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -988,16 +828,31 @@ static void check_preempt_curr_fair(stru resched_task(curr); return; } - - gran = sysctl_sched_wakeup_granularity; /* - * Batch tasks prefer throughput over latency: + * Batch tasks do not preempt (their preemption is driven by + * the tick): */ if (unlikely(p->policy == SCHED_BATCH)) - gran = sysctl_sched_batch_wakeup_granularity; + return; + + if (sched_feat(WAKEUP_PREEMPT)) { + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } + + delta = se->vruntime - pse->vruntime; + gran = sysctl_sched_wakeup_granularity; + if (unlikely(se->load.weight != NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); + if (delta > gran) { + int now = !sched_feat(PREEMPT_RESTRICT); + + if (now || p->prio < curr->prio || !se->peer_preempt++) + resched_task(curr); + } + } } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1041,7 +896,7 @@ static void put_prev_task_fair(struct rq * achieve that by always pre-iterating before returning * the current task: */ -static inline struct 
task_struct * +static struct task_struct * __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) { struct task_struct *p; @@ -1078,7 +933,10 @@ static int cfs_rq_best_prio(struct cfs_r if (!cfs_rq->nr_running) return MAX_PRIO; - curr = __pick_next_entity(cfs_rq); + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + p = task_of(curr); return p->prio; @@ -1153,6 +1011,8 @@ static void task_tick_fair(struct rq *rq } } +#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) + /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks @@ -1163,37 +1023,32 @@ static void task_tick_fair(struct rq *rq static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + int this_cpu = smp_processor_id(); sched_info_queued(p); update_curr(cfs_rq); - update_stats_enqueue(cfs_rq, se); - /* - * Child runs first: we let it run before the parent - * until it reschedules once. 
We set up the key so that - * it will preempt the parent: - */ - se->fair_key = curr->fair_key - - niced_granularity(curr, sched_granularity(cfs_rq)) - 1; - /* - * The first wait is dominated by the child-runs-first logic, - * so do not credit it with that waiting time yet: - */ - if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - se->wait_start_fair = 0; + place_entity(cfs_rq, se, 1); - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - se->wait_runtime = -(sched_granularity(cfs_rq) / 2); + if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && + curr->vruntime < se->vruntime) { + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ + swap(curr->vruntime, se->vruntime); + } + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + check_spread(cfs_rq, curr); __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); + se->peer_preempt = 0; + resched_task(rq->curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED /* Account for a task changing its policy or group. 
* * This routine is mostly called to set cfs_rq->curr field when a task @@ -1206,21 +1061,17 @@ static void set_curr_task_fair(struct rq for_each_sched_entity(se) set_next_entity(cfs_rq_of(se), se); } -#else -static void set_curr_task_fair(struct rq *rq) -{ -} -#endif /* * All the scheduling class methods: */ -struct sched_class fair_sched_class __read_mostly = { +static const struct sched_class fair_sched_class = { + .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, - .check_preempt_curr = check_preempt_curr_fair, + .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, @@ -1237,6 +1088,9 @@ static void print_cfs_stats(struct seq_f { struct cfs_rq *cfs_rq; +#ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); +#endif for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); } diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3503fb2..6e2ead4 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -50,10 +50,15 @@ static void task_tick_idle(struct rq *rq { } +static void set_curr_task_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ -static struct sched_class idle_sched_class __read_mostly = { +const struct sched_class idle_sched_class = { + /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -66,6 +71,7 @@ static struct sched_class idle_sched_cla .load_balance = load_balance_idle, + .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4b87476..d0097a0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -7,7 +7,7 @@ * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
*/ -static inline void update_curr_rt(struct rq *rq) +static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *r } static void -yield_task_rt(struct rq *rq, struct task_struct *p) +yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, p); + requeue_task_rt(rq, rq->curr); } /* @@ -206,7 +206,7 @@ static void task_tick_rt(struct rq *rq, if (--p->time_slice) return; - p->time_slice = static_prio_timeslice(p->static_prio); + p->time_slice = DEF_TIMESLICE; /* * Requeue to the end of queue if we are not the only element @@ -218,7 +218,15 @@ static void task_tick_rt(struct rq *rq, } } -static struct sched_class rt_sched_class __read_mostly = { +static void set_curr_task_rt(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock; +} + +const struct sched_class rt_sched_class = { + .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -230,5 +238,6 @@ static struct sched_class rt_sched_class .load_balance = load_balance_rt, + .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c20a94d..1c08484 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -16,18 +16,18 @@ static int show_schedstat(struct seq_fil struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_SMP struct sched_domain *sd; - int dcnt = 0; + int dcount = 0; #endif /* runqueue-specific stats */ seq_printf(seq, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, 
rq->rq_sched_info.pcnt); + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -39,12 +39,12 @@ #ifdef CONFIG_SMP char mask_str[NR_CPUS]; cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); + seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " "%lu", - sd->lb_cnt[itype], + sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], sd->lb_imbalance[itype], @@ -55,9 +55,9 @@ #ifdef CONFIG_SMP } seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } @@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsi { if (rq) { rq->rq_sched_info.run_delay += delta; - rq->rq_sched_info.pcnt++; + rq->rq_sched_info.pcount++; } } @@ -129,7 +129,7 @@ # define schedstat_add(rq, field, amt) d # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given * the cpu. 
We should note that with the exception of interactive @@ -164,7 +164,7 @@ static void sched_info_arrive(struct tas sched_info_dequeued(t); t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; - t->sched_info.pcnt++; + t->sched_info.pcount++; rq_sched_info_arrive(task_rq(t), delta); } @@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *pr #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53a456e..c278838 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -222,14 +222,11 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, + .procname = "sched_nr_latency", + .data = &sysctl_sched_nr_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, + .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, @@ -266,28 +263,6 @@ #ifdef CONFIG_SCHED_DEBUG }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_stat_granularity_ns", - .data = &sysctl_sched_stat_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_runtime_limit_ns", - .data = &sysctl_sched_runtime_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = 
&sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), diff --git a/kernel/user.c b/kernel/user.c index 9ca2848..f0e561e 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -50,12 +50,16 @@ #ifdef CONFIG_KEYS .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif +#ifdef CONFIG_FAIR_USER_SCHED + .tg = &init_task_group, +#endif }; /* * These routines must be called with the uidhash spinlock held! */ -static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) +static inline void uid_hash_insert(struct user_struct *up, + struct hlist_head *hashent) { hlist_add_head(&up->uidhash_node, hashent); } @@ -65,13 +69,14 @@ static inline void uid_hash_remove(struc hlist_del_init(&up->uidhash_node); } -static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +static inline struct user_struct *uid_hash_find(uid_t uid, + struct hlist_head *hashent) { struct user_struct *user; struct hlist_node *h; hlist_for_each_entry(user, h, hashent, uidhash_node) { - if(user->uid == uid) { + if (user->uid == uid) { atomic_inc(&user->__count); return user; } @@ -80,6 +85,203 @@ static inline struct user_struct *uid_ha return NULL; } +#ifdef CONFIG_FAIR_USER_SCHED + +static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ +static DEFINE_MUTEX(uids_mutex); + +static void sched_destroy_user(struct user_struct *up) +{ + sched_destroy_group(up->tg); +} + +static int sched_create_user(struct user_struct *up) +{ + int rc = 0; + + up->tg = sched_create_group(); + if (IS_ERR(up->tg)) + rc = -ENOMEM; + + return rc; +} + +static void sched_switch_user(struct task_struct *p) +{ + sched_move_task(p); +} + +static inline void uids_mutex_lock(void) +{ + mutex_lock(&uids_mutex); +} + +static inline void uids_mutex_unlock(void) +{ + mutex_unlock(&uids_mutex); +} + +/* return cpu shares held by the user */ +ssize_t cpu_shares_show(struct kset *kset, char *buffer) +{ + struct user_struct *up = 
container_of(kset, struct user_struct, kset); + + return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); +} + +/* modify cpu shares held by the user */ +ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size) +{ + struct user_struct *up = container_of(kset, struct user_struct, kset); + unsigned long shares; + int rc; + + sscanf(buffer, "%lu", &shares); + + rc = sched_group_set_shares(up->tg, shares); + + return (rc ? rc : size); +} + +static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) +{ + sa->attr.name = name; + sa->attr.mode = mode; + sa->show = cpu_shares_show; + sa->store = cpu_shares_store; +} + +/* Create "/sys/kernel/uids/" directory and + * "/sys/kernel/uids//cpu_share" file for this user. + */ +static int user_kobject_create(struct user_struct *up) +{ + struct kset *kset = &up->kset; + struct kobject *kobj = &kset->kobj; + int error; + + memset(kset, 0, sizeof(struct kset)); + kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ + kobject_set_name(kobj, "%d", up->uid); + kset_init(kset); + user_attr_init(&up->user_attr, "cpu_share", 0644); + + error = kobject_add(kobj); + if (error) + goto done; + + error = sysfs_create_file(kobj, &up->user_attr.attr); + if (error) + kobject_del(kobj); + + kobject_uevent(kobj, KOBJ_ADD); + +done: + return error; +} + +/* create these in sysfs filesystem: + * "/sys/kernel/uids" directory + * "/sys/kernel/uids/0" directory (for root user) + * "/sys/kernel/uids/0/cpu_share" file (for root user) + */ +int __init uids_kobject_init(void) +{ + int error; + + /* create under /sys/kernel dir */ + uids_kobject.parent = &kernel_subsys.kobj; + uids_kobject.kset = &kernel_subsys; + kobject_set_name(&uids_kobject, "uids"); + kobject_init(&uids_kobject); + + error = kobject_add(&uids_kobject); + if (!error) + error = user_kobject_create(&root_user); + + return error; +} + +/* work function to remove sysfs directory for a user and free up + * corresponding structures. 
+ */ +static void remove_user_sysfs_dir(struct work_struct *w) +{ + struct user_struct *up = container_of(w, struct user_struct, work); + struct kobject *kobj = &up->kset.kobj; + unsigned long flags; + int remove_user = 0; + + /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() + * atomic. + */ + uids_mutex_lock(); + + local_irq_save(flags); + + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { + uid_hash_remove(up); + remove_user = 1; + spin_unlock_irqrestore(&uidhash_lock, flags); + } else { + local_irq_restore(flags); + } + + if (!remove_user) + goto done; + + sysfs_remove_file(kobj, &up->user_attr.attr); + kobject_uevent(kobj, KOBJ_REMOVE); + kobject_del(kobj); + + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); + +done: + uids_mutex_unlock(); +} + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. + */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + /* restore back the count */ + atomic_inc(&up->__count); + spin_unlock_irqrestore(&uidhash_lock, flags); + + INIT_WORK(&up->work, remove_user_sysfs_dir); + schedule_work(&up->work); +} + +#else /* CONFIG_FAIR_USER_SCHED */ + +static void sched_destroy_user(struct user_struct *up) { } +static int sched_create_user(struct user_struct *up) { return 0; } +static void sched_switch_user(struct task_struct *p) { } +static inline int user_kobject_create(struct user_struct *up) { return 0; } +static inline void uids_mutex_lock(void) { } +static inline void uids_mutex_unlock(void) { } + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. 
+ */ +static inline void free_user(struct user_struct *up, unsigned long flags) +{ + uid_hash_remove(up); + spin_unlock_irqrestore(&uidhash_lock, flags); + sched_destroy_user(up); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); +} + +#endif /* CONFIG_FAIR_USER_SCHED */ + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). @@ -106,15 +308,10 @@ void free_uid(struct user_struct *up) return; local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { - uid_hash_remove(up); - spin_unlock_irqrestore(&uidhash_lock, flags); - key_put(up->uid_keyring); - key_put(up->session_keyring); - kmem_cache_free(uid_cachep, up); - } else { + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + free_user(up, flags); + else local_irq_restore(flags); - } } struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) @@ -122,6 +319,11 @@ struct user_struct * alloc_uid(struct us struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; + /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() + * atomic. + */ + uids_mutex_lock(); + spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); spin_unlock_irq(&uidhash_lock); @@ -150,6 +352,22 @@ #endif return NULL; } + if (sched_create_user(new) < 0) { + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + return NULL; + } + + if (user_kobject_create(new)) { + sched_destroy_user(new); + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + uids_mutex_unlock(); + return NULL; + } + /* * Before adding this, check whether we raced * on adding the same user already.. 
@@ -157,6 +375,11 @@ #endif spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + /* This case is not possible when CONFIG_FAIR_USER_SCHED + * is defined, since we serialize alloc_uid() using + * uids_mutex. Hence no need to call + * sched_destroy_user() or remove_user_sysfs_dir(). + */ key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -167,6 +390,9 @@ #endif spin_unlock_irq(&uidhash_lock); } + + uids_mutex_unlock(); + return up; } @@ -184,6 +410,7 @@ void switch_uid(struct user_struct *new_ atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + sched_switch_user(current); /* * We need to synchronize with __sigqueue_alloc()