GIT 3d2d94ca01455fc4f220fb8a054c3b9bf8a72f8c git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git

commit 3d2d94ca01455fc4f220fb8a054c3b9bf8a72f8c
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:22 2007 +0200

sched: group scheduler, fix latency

There is a possibility that, because a task of a group moves from one
cpu to another, it may gain more cpu time than desired. See
http://marc.info/?l=linux-kernel&m=119073197730334 for details.

This is an attempt to fix that problem. Basically it simulates dequeue
of higher level entities as if they are going to sleep. Similarly it
simulates wakeup of higher level entities as if they are waking up from
sleep.

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Ingo Molnar

commit 8cbfc4160132b0560c5a1e0302dd06104e38d4cc
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:22 2007 +0200

sched: group scheduler, fix bloat

The recent fix to check_preempt_wakeup() to check for preemption at
higher levels caused a size bloat for !CONFIG_FAIR_GROUP_SCHED. Fix the
problem.

42277 10598 320 53195 cfcb kernel/sched.o-before_this_patch
42216 10598 320 53134 cf8e kernel/sched.o-after_this_patch

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Ingo Molnar

commit 1843963593587ad589e7f3d70a10b934b7581644
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:22 2007 +0200

sched: group scheduler, fix coding style issues

Fix coding style issues reported by Randy Dunlap and others.

Signed-off-by: Dhaval Giani
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Ingo Molnar

commit 0c5db02d346bd69a2645ba701e727f8c3d93f85c
Author: Ingo Molnar
Date: Wed Sep 26 15:37:22 2007 +0200

sched: cleanup, remove stale comment

cleanup, remove stale comment.

Signed-off-by: Ingo Molnar

commit 1a8f5b996c13b181a508a696d1ab24deb918d5fd
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:22 2007 +0200

sched: speed up and simplify vslice calculations

speed up and simplify vslice calculations.

[ From: Mike Galbraith : build fix ]

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar

commit 42452c4bc0be79cf890ceab4838f61ba7ce3fe8f
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:22 2007 +0200

sched: clean up min_vruntime use

clean up min_vruntime use.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar

commit 34a8006bf85a9532441a1ffaf3ca10994850bed1
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:22 2007 +0200

sched: group scheduler SMP migration fix

group scheduler SMP migration fix.

Signed-off-by: Ingo Molnar

commit 3a5bced3910e4507bbfe4e458ab5f4a821b343dc
Author: Ingo Molnar
Date: Wed Sep 26 15:37:22 2007 +0200

sched: clean up schedstats, cnt -> count

rename all 'cnt' fields and variables to the less yucky 'count' name.
yuckage noticed by Andrew Morton.

no change in code, other than the /proc/sched_debug bkl_count string
got a bit larger:

text data bss dec hex filename
38236 3506 24 41766 a326 sched.o.before
38240 3506 24 41770 a32a sched.o.after

Signed-off-by: Ingo Molnar

commit e08ebd3fab911fc90b98e866374982473cb586d5
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:22 2007 +0200

sched: yield fix

fix yield bugs due to the current-not-in-rbtree changes.

[ From: Srivatsa Vaddagiri : build fix.
]

also, nice code size reduction:

kernel/sched.o:
text data bss dec hex filename
38323 3506 24 41853 a37d sched.o.before
38236 3506 24 41766 a326 sched.o.after

Signed-off-by: Ingo Molnar
Signed-off-by: Dmitry Adamushko

commit 3e24baad4959f538c76299c2181828901557b778
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:22 2007 +0200

sched: group scheduler wakeup latency fix

group scheduler wakeup latency fix.

Signed-off-by: Ingo Molnar

commit a57ad348a2b796cbe4d3178db38859291fd6f55e
Author: Ingo Molnar
Date: Wed Sep 26 15:37:22 2007 +0200

sched: remove set_leftmost()

Lee Schermerhorn noticed that set_leftmost() contains dead code,
remove this.

Reported-by: Lee Schermerhorn
Signed-off-by: Ingo Molnar

commit 864c6e55231eb5b92651ffc143bc9968c7d33e0f
Author: Hiroshi Shimamoto
Date: Wed Sep 26 15:37:21 2007 +0200

sched: clean up sched_fork()

Adjusting the sched_class is a missing part of the already existing
"do not leak PI boosting priority to the child" logic in sched_fork().
This patch moves the sched_class adjustment from wake_up_new_task()
to sched_fork().

This also shrinks the code a bit:

text data bss dec hex filename
40111 4018 292 44421 ad85 sched.o.before
40102 4018 292 44412 ad7c sched.o.after

Signed-off-by: Hiroshi Shimamoto
Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar

commit ab758fd487b358dbd995baf6c7c0c1697cd189b5
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:21 2007 +0200

sched: max_vruntime() simplification

max_vruntime() simplification.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 6cf96884ef0f26722c848beb092ff9a3d60922a5
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched: fix sched_fork()

fix sched_fork().

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit e02f773463f01f1e570b20c80c78928d6312ed03
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched: fix place_entity()

fix place_entity().

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit f94c4c2f86b4d7755bff4f54bfe3881790e0dc78
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched: undo some of the recent changes

undo some of the recent changes that are not needed after all,
such as last_min_vruntime.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 1b491e1abcce7937e01516474f3865181d82c610
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched: remove last_min_vruntime effect

remove last_min_vruntime effect.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 0373b3cbb6514fffa087e6523a042ae3f4a1ff7f
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched: remove condition from set_task_cpu()

remove condition from set_task_cpu(). Now that ->vruntime is not
global anymore, it should work fine without it too.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit b06e5b5a42172711dbc673581a27fd7c27d2d06d
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched: entity_key() fix

entity_key() fix.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit f8025ca8751c7992dd4296161754ea523a8bf778
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:21 2007 +0200

sched debug: check spread

check spread.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit a070ceb0379b182a57fa7e08fcf397055793e2a7
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched debug: more width for parameter printouts

more width for parameter printouts in /proc/sched_debug.
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 14e0179c0c0c35b133902a5782910dad51b7d625
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:21 2007 +0200

sched: add vslice

add vslice.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 460b8ac6b91d361702d5706b6dec5b435af7d123
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched debug: print settings

print the current value of all tunables in /proc/sched_debug output.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit b85a1e32c77a9ac0152ec56d6e6c7f246d7ea147
Author: Ingo Molnar
Date: Wed Sep 26 15:37:21 2007 +0200

sched: remove unneeded tunables

remove unneeded tunables.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 308e5438bb65831da715433a813fc94d2c4c8982
Author: S.Caglar Onur
Date: Wed Sep 26 15:37:21 2007 +0200

sched debug: BKL usage statistics, fix

build fix for the SCHED_DEBUG && !SCHEDSTATS case.

Signed-off-by: S.Caglar Onur
Signed-off-by: Ingo Molnar

commit 1ca40c5148f3144adba382e45dabce644d577f51
Author: Ingo Molnar
Date: Wed Sep 26 15:37:20 2007 +0200

sched debug: BKL usage statistics

add per task and per rq BKL usage statistics.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 6f6e09a9fe6090f6761ff32f5a777d2c0eae3782
Author: Ingo Molnar
Date: Wed Sep 26 15:37:20 2007 +0200

sched: enable CONFIG_FAIR_GROUP_SCHED=y by default

enable CONFIG_FAIR_GROUP_SCHED=y by default.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 24f079b72d614c87a13bf53e527119cbecaf2ca4
Author: Ingo Molnar
Date: Wed Sep 26 15:37:20 2007 +0200

sched: fair-group sched, cleanups

fair-group sched, cleanups.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 55e846a477ac5611223c45a20b9973416f3e73eb
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:20 2007 +0200

sched: add fair-user scheduler

Enable user-id based fair group scheduling. This is useful for anyone
who wants to test the group scheduler w/o having to enable
CONFIG_CGROUPS.

A separate scheduling group (i.e. struct task_grp) is automatically
created for every new user added to the system. Upon uid change for a
task, it is made to move to the corresponding scheduling group.

A /proc tunable (/proc/root_user_share) is also provided to tune root
user's quota of cpu bandwidth.

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 913652948605aa16ee591a4c3b04bfcba0a8b35a
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:20 2007 +0200

sched: clean up code under CONFIG_FAIR_GROUP_SCHED

With a view to supporting user-id based fair scheduling (and not just
container-based fair scheduling), this patch renames several functions
and makes them independent of whether they are being used for
container-based or user-id based fair scheduling.

Also fix a problem reported by KAMEZAWA Hiroyuki (wrt allocating a
smaller-sized array for tg->cfs_rq[] and tg->se[]).
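For readers following the group-scheduler work, the stand-alone C sketch
below shows the rough shape of the per-group bookkeeping these commits
revolve around: one scheduling entity and one cfs runqueue per CPU, plus
a shares value. It is an illustrative userspace mock-up under assumed
names (the *_s types, alloc_task_grp() and the NR_CPUS value are stand-ins),
not the kernel code itself.

/* Illustrative sketch only: userspace mock-up of the group-scheduler
 * data layout.  The types and helper below are hypothetical. */
#include <stdlib.h>

#define NR_CPUS 4			/* stand-in for the kernel constant */

struct sched_entity_s { unsigned long weight; };
struct cfs_rq_s       { unsigned long nr_running; };

struct task_grp_s {
	struct sched_entity_s **se;	/* one schedulable entity per CPU */
	struct cfs_rq_s **cfs_rq;	/* one runqueue "owned" by the group per CPU */
	unsigned long shares;		/* the group's share of cpu bandwidth */
};

static struct task_grp_s *alloc_task_grp(void)
{
	struct task_grp_s *tg = calloc(1, sizeof(*tg));

	if (!tg)
		return NULL;
	/* The arrays hold NR_CPUS pointers; the per-CPU objects themselves
	 * are allocated and linked in separately. */
	tg->se = calloc(NR_CPUS, sizeof(*tg->se));
	tg->cfs_rq = calloc(NR_CPUS, sizeof(*tg->cfs_rq));
	if (!tg->se || !tg->cfs_rq) {
		free(tg->se);
		free(tg->cfs_rq);
		free(tg);
		return NULL;
	}
	tg->shares = 1024;		/* assumed nice-0 weight as the default share */
	return tg;
}

int main(void)
{
	struct task_grp_s *tg = alloc_task_grp();
	/* per-CPU cfs_rq/se objects would be allocated and attached here */
	return tg ? 0 : 1;
}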
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit f6365914c69fe839a6a8c650e8844549906bc10d
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:20 2007 +0200

sched: print &rq->cfs stats

- Print &rq->cfs statistics as well (useful for group scheduling)

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 10beab006a7d4abaed84e47cd259f8fc66bb3c3c
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:20 2007 +0200

sched: print nr_running and load in /proc/sched_debug

- print nr_running and load information for cfs_rq in /proc/sched_debug

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 4b369e92819ef7b6811b7c63e22bd84e765df51c
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:20 2007 +0200

sched: fix minor bug in yield

- fix a minor bug in yield (seen for CONFIG_FAIR_GROUP_SCHED)

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit ed674758ec00f53e5cb97be864a6b5d8740cba96
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:20 2007 +0200

sched: revert recent removal of set_curr_task()

Revert removal of set_curr_task.
Use put_prev_task/set_curr_task when changing groups/policies.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit edfd9210e76f508d89da3231289705647240bcac
Author: Ingo Molnar
Date: Wed Sep 26 15:37:20 2007 +0200

sched: kernel/sched_fair.c whitespace cleanups

some trivial whitespace cleanups.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 9a2e41298a1d8edf1d9c381618647a6ccce6bec1
Author: Mike Galbraith
Date: Wed Sep 26 15:37:20 2007 +0200

sched: fix formatting of /proc/sched_debug

fix formatting of /proc/sched_debug.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit c0da4db02d9893d637c8dbe701596c2606cb48fe
Author: Ingo Molnar
Date: Wed Sep 26 15:37:19 2007 +0200

sched: enhance debug output

enhance debug output by changing 12345678 nsecs to 12.345678 output,
this is more human-readable.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit e37686d196cc8a15c40af6a12a002d731a246aaf
Author: Ingo Molnar
Date: Wed Sep 26 15:37:19 2007 +0200

sched: prettify /proc/sched_debug output

print the correct amount of dashes in /proc/sched_debug.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 4a4f01cc78e74f3914f33c3ead2ac9e717779012
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:19 2007 +0200

sched: rework enqueue/dequeue_entity() to get rid of set_curr_task()

rework enqueue/dequeue_entity() to get rid of
sched_class::set_curr_task(). This simplifies sched_setscheduler(),
rt_mutex_setprio() and sched_move_tasks().

text data bss dec hex filename
24330 2734 20 27084 69cc sched.o.before
24233 2730 20 26983 6967 sched.o.after

Signed-off-by: Dmitry Adamushko
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 96579013f44fc9797e689938d2f976dae99e0278
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:19 2007 +0200

sched: simplify sched_class::yield_task()

the 'p' (task_struct) parameter in the sched_class::yield_task() is
redundant as the caller is always the 'current'. Get rid of it.

text data bss dec hex filename
24341 2734 20 27095 69d7 sched.o.before
24330 2734 20 27084 69cc sched.o.after

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 2ef878662ef152d9a6da34a997ae2457c5cbc64c
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:18 2007 +0200

sched: optimize task_new_fair()

due to the fact that we no longer keep the 'current' within the tree,
dequeue/enqueue_entity() is useless for the 'current' in
task_new_fair(). We are about to reschedule and
sched_class->put_prev_task() will put the 'current' back into the
tree, based on its new key.

text data bss dec hex filename
24388 2734 20 27142 6a06 sched.o.before
24341 2734 20 27095 69d7 sched.o.after

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit c5577ee45fb24772c0f5fe1a1046399e8acfab33
Author: Ingo Molnar
Date: Wed Sep 26 15:37:18 2007 +0200

sched: fix delay accounting performance regression

fix delay accounting performance regression - those sched_clock()
calls are not needed.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit c361ab6e30354175bda62d73f40a7938cab13fa0
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:18 2007 +0200

sched: do not keep current in the tree and get rid of sched_entity::fair_key

Get rid of 'sched_entity::fair_key'.

As a side effect, 'current' is not kept within the tree for
SCHED_NORMAL/BATCH tasks anymore. This simplifies some parts of the
code (e.g. entity_tick() and yield_task_fair()) and also somewhat
optimizes them (e.g. a single update_curr() now vs. dequeue/enqueue()
before in entity_tick()).

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 0754b0004b25e75652219de9e0f703fc0ad617c1
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:18 2007 +0200

sched: add set_curr_task() calls

p->sched_class->set_curr_task() has to be called before
activate_task()/enqueue_task() in rt_mutex_setprio(),
sched_setscheduler() and sched_move_task() in order to set up
'cfs_rq->curr'. The logic of enqueueing depends on whether a task to
be inserted is 'current' or not.

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit b7b3af0d13fda1e7829548794343eae44be23492
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:18 2007 +0200

sched: sched_setscheduler() fix

Fix a problem in the 'sched-group' patch for !CONFIG_FAIR_GROUP_SCHED.

description:

sched_setscheduler()
{
...
if (task_running()) p->sched_class->put_prev_entity();
[ this one sets up cfs_rq->curr to NULL ]
...
if (task_running) p->sched_class->set_curr_task();
[ and this one is a _NOP_ (empty) for !CONFIG_FAIR_GROUP_SCHED ]

As a result, the task continues to run with cfs_rq->curr == NULL... no
crashes (due to checks for !NULL in place) but e.g. update_curr()
effectively becomes a NOP... i.e. runtime statistics for this task are
not accounted until it's rescheduled anew.

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit 0b9edda72301c35cc00d3b02a3dd4052373d6052
Author: Srivatsa Vaddagiri
Date: Wed Sep 26 15:37:18 2007 +0200

sched: group-scheduler core

Add interface to control cpu bandwidth allocation to task-groups.
(not yet configurable, due to missing CONFIG_CONTAINERS)

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit db7c2a4c0fe9ce14e833ace213c66aa66ae9b998
Author: Mike Galbraith
Date: Wed Sep 26 15:37:14 2007 +0200

sched: fix SMP migration latencies

fix SMP migration latencies.

Signed-off-by: Mike Galbraith
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra

commit e00d85cf8a2851761231e3633c50a5d810b066fd
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:14 2007 +0200

sched: better min_vruntime tracking

Better min_vruntime tracking.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith

commit e665fd72ebfe4470b2dbd35eb64072e8ae8f747f
Author: Ingo Molnar
Date: Wed Sep 26 15:37:14 2007 +0200

sched: x86: allow single-depth wchan output

sched.o gets smaller and faster if we compile it with
-fomit-frame-pointer, so make this a config option. The cost is the
loss of multi-depth wchan lookups - but SysRq-T is a sufficient
replacement for them anyway, so their utility is much lower these
days.

the size difference is significant:

text data bss dec hex filename
34005 3462 24 37491 9273 sched.o.before
33470 3462 24 36956 905c sched.o.after

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 09c8634f98b2d7d91012e2a1de8b0fa02f89a29b
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:14 2007 +0200

sched: clean up schedstat block in dequeue_entity()

Better placement of the #ifdef CONFIG_SCHEDSTATS block in
dequeue_entity().

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith
Signed-off-by: Peter Zijlstra

commit b5719e8d9bb743c77f854f244dc77cc06fd3f57c
Author: Ingo Molnar
Date: Wed Sep 26 15:37:14 2007 +0200

sched: remove wait_runtime fields and features

remove wait_runtime based fields and features.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 7fdcc45e5018ac7b4ebb353b72808c5e7607a299
Author: Ingo Molnar
Date: Wed Sep 26 15:37:14 2007 +0200

sched: remove wait_runtime limit

remove the wait_runtime-limit fields.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 630af17cd68c0ecf9c73172c30aba11c7850915a
Author: Dmitry Adamushko
Date: Wed Sep 26 15:37:14 2007 +0200

sched: clean up struct load_stat

'struct load_stat' is redundant now so let's get rid of it.

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 8e6305912e3ad1e7c9a266cd2d35e3e5cfb06e62
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: debug: update exec_clock only when SCHED_DEBUG

micro-optimization: update cfs_rq->exec_clock only if
CONFIG_SCHED_DEBUG=y.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 83ba30df801e707c51df7cac353b6270a6d82c17
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: add more vruntime statistics

add more vruntime statistics.

Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith
Signed-off-by: Peter Zijlstra

commit 8e34379b4d9f80d49900b337b4a873011cc02d84
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:13 2007 +0200

sched: handle vruntime overflow

Handle vruntime overflow by centering the key space around
min_vruntime.
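A stand-alone sketch of the idea just described, under assumed names
(the *_sketch types and helpers are illustrative, not the patch's
functions): keying the rbtree on the signed distance from min_vruntime
keeps the ordering stable even when the unsigned vruntime counters wrap.

/* Illustrative sketch of "center the key space around min_vruntime";
 * compiles as plain userspace C.  The structs are simplified stand-ins
 * for struct cfs_rq / struct sched_entity. */
#include <stdint.h>
#include <stdio.h>

struct cfs_rq_sketch { uint64_t min_vruntime; };
struct sched_entity_sketch { uint64_t vruntime; };

/* Unsigned subtraction wraps modulo 2^64; reinterpreting the result as
 * signed gives the correct small distance as long as all entities on
 * one queue stay within 2^63 of min_vruntime. */
static inline int64_t entity_key_sketch(const struct cfs_rq_sketch *cfs_rq,
					const struct sched_entity_sketch *se)
{
	return (int64_t)(se->vruntime - cfs_rq->min_vruntime);
}

static inline int entity_before_sketch(const struct cfs_rq_sketch *cfs_rq,
				       const struct sched_entity_sketch *a,
				       const struct sched_entity_sketch *b)
{
	return entity_key_sketch(cfs_rq, a) < entity_key_sketch(cfs_rq, b);
}

int main(void)
{
	struct cfs_rq_sketch rq = { .min_vruntime = UINT64_MAX - 100 };
	struct sched_entity_sketch a = { .vruntime = UINT64_MAX - 50 };
	struct sched_entity_sketch b = { .vruntime = 25 };	/* already wrapped past 2^64 */

	/* b is "later" than a even though its raw u64 value is tiny */
	printf("a before b: %d\n", entity_before_sketch(&rq, &a, &b));
	return 0;
}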
Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith

commit 48d24bbc6eeacaa7fb1827950021261bb3f3256b
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:13 2007 +0200

sched: add tree based averages

add support for tree based vruntime averages.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith

commit fe10a9d058731bf4bb3894061cb28b11ceb036e1
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: add se->vruntime debugging

debug se->vruntime fields.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 90ec7f52207fda99d7b6905eec603d3d5dc2f3fc
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:13 2007 +0200

sched: clean up new task placement

clean up new task placement.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith

commit 9d6d8ba5dd93061c5ca19fa1ef704a22681d7cb8
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: wakeup granularity fix

fix wakeup granularity. (and increase the default)

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 487d459f05b6bbe790d088192f9dfe9955855e68
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: simplify check_preempt() methods

simplify the check_preempt() methods.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 7917d1a3084c5bf2f77fc685fd89a02617a95b9a
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:13 2007 +0200

sched: simplify adaptive latency

simplify adaptive latency.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith

commit f87c3c33d1c919d2b5c6c8e1e0094944169e477f
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:13 2007 +0200

sched: new task placement for vruntime

add proper new task placement for the vruntime based math too.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith

commit 65d87356c031f111b0cee14ce24f4f2fd3c1c7b3
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: optimize vruntime based scheduling

optimize vruntime based scheduling.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 02de25f8f37a58d65f8695ade2d0a616cc3011c3
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: move sched_feat() definitions

move sched_feat() definitions so that it can be used sooner by generic
code too.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 417ad3bd02f872f7aa87f39a5bd60a2820cd74b3
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: introduce se->vruntime

introduce se->vruntime as a sum of weighted delta-exec's, and use that
as the key into the tree.

the idea to use absolute virtual time as the basic metric of
scheduling was first raised by William Lee Irwin, advanced by Tong Li
and first prototyped by Roman Zippel in the "Really Fair Scheduler"
(RFS) patchset.

also see: http://lkml.org/lkml/2007/9/2/76 for a simpler variant of
this patch.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 2d2a99ca0ff4bd3e3d6974d22a1a97541c4a0cbf
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: clean up calc_weighted()

clean up calc_weighted() - we always use the normalized shift, so
there is no need to pass that in. Also, push the non-nice0 branch into
the function.
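To make the "sum of weighted delta-exec's" accounting from the
se->vruntime commit above concrete, here is a hedged userspace sketch;
NICE_0_LOAD_SKETCH and the helper name are assumptions for illustration,
and the kernel itself goes through calc_delta_mine() with precomputed
inverse weights rather than a direct 64-bit division.

/* Sketch: how much vruntime a task accumulates for a given amount of
 * real execution time, scaled by its load weight.  Heavier (lower nice)
 * tasks advance their vruntime more slowly, so they get a larger share
 * of CPU before the rbtree picks someone else. */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD_SKETCH 1024ULL	/* assumed nice-0 weight */

static uint64_t vruntime_delta_sketch(uint64_t delta_exec_ns,
				      uint64_t load_weight)
{
	return delta_exec_ns * NICE_0_LOAD_SKETCH / load_weight;
}

int main(void)
{
	/* Two tasks run for 1ms each: a nice-0 task (weight 1024) and a
	 * heavier task (weight 2048).  The heavier one advances only half
	 * as far in virtual time. */
	printf("nice-0 : +%llu ns vruntime\n",
	       (unsigned long long)vruntime_delta_sketch(1000000, 1024));
	printf("heavier: +%llu ns vruntime\n",
	       (unsigned long long)vruntime_delta_sketch(1000000, 2048));
	return 0;
}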
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 945ddf19bc7da7005c8afacdd5b8a7d3d45763f6
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: speed up update_load_add/_sub()

speed up update_load_add/_sub() by not delaying the division - this
reduces CPU pipeline dependencies.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 1905eaaf575557879de5d9e1e60a4388c6498baa
Author: Ingo Molnar
Date: Wed Sep 26 15:37:13 2007 +0200

sched: uninline __enqueue_entity()/__dequeue_entity()

suggested by Roman Zippel: uninline __enqueue_entity() and
__dequeue_entity().

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 31dd93aed83cbb3b13492933eb7647b3cb4ae208
Author: Peter Zijlstra
Date: Wed Sep 26 15:37:13 2007 +0200

sched: simplify SCHED_FEAT_* code

Peter Zijlstra suggested simplifying the SCHED_FEAT_* checks via the
sched_feat(x) macro.

No code changed:

text data bss dec hex filename
38895 3550 24 42469 a5e5 sched.o.before
38895 3550 24 42469 a5e5 sched.o.after

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
Signed-off-by: Mike Galbraith

commit b0af50c6f8f422fd3822df49255f01ff4a8d5bac
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: cleanup: simplify cfs_rq_curr() methods

cleanup: simplify cfs_rq_curr() methods - now that the cfs_rq->curr
pointer is unconditionally present, remove the wrappers.

kernel/sched.o:
text data bss dec hex filename
11784 224 2012 14020 36c4 sched.o.before
11784 224 2012 14020 36c4 sched.o.after

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 35f97a75af84de26097b0d54aa20b752915ff455
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: track cfs_rq->curr on !group-scheduling too

Noticed by Roman Zippel: use cfs_rq->curr in the !group-scheduling
case too.

Small micro-optimization and cleanup effect:

text data bss dec hex filename
36269 3482 24 39775 9b5f sched.o.before
36177 3486 24 39687 9b07 sched.o.after

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 885ad07f8fa1f0ac4d8f2ade84d9668d97c3dfe6
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: remove precise CPU load calculations #2

continued removal of precise CPU load calculations.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 66105294364833f6a1fbe4047140c7d25507a780
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: remove precise CPU load

CPU load calculations are statistical anyway, and there's little
benefit from having them calculated on every scheduling event. So
remove this code, it gets rid of a divide from the scheduler wakeup
and context-switch fastpath.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 0ca06b358a16d910d91735455e9cbbd075d78832
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: remove stat_gran

remove the stat_gran code - it was disabled by default and it causes
unnecessary overhead.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 60c3a1350a8f991642d47ca042d90cff0d85cf2f
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: use constants if !CONFIG_SCHED_DEBUG

use constants if !CONFIG_SCHED_DEBUG.
this speeds up the code and reduces code-size:

text data bss dec hex filename
27464 3014 16 30494 771e sched.o.before
26929 3010 20 29959 7507 sched.o.after

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 616aeab730a7a0edb4c1d5f532e41d7745a9d4b0
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: uniform tunings

use uniform tunings on UP and SMP alike.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit efb89441c1bf02bb02f89a545be09e91abcd0f2a
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: debug: track maximum 'slice'

track the maximum amount of time a task has executed while the CPU
load was at least 2x. (i.e. at least two nice-0 tasks were runnable)

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit f3bb3e906b07e769eae93e21b4342878ddbfef0e
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: small sched_debug cleanup

small kernel/sched_debug.c cleanup - break up multi-variable
assignment.

no code changed:

text data bss dec hex filename
38869 3550 24 42443 a5cb sched.o.before
38869 3550 24 42443 a5cb sched.o.after

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit b3cfdd7ddd69af875700ff63b2ca56a6aa928c90
Author: Matthias Kaehlcke
Date: Wed Sep 26 15:37:12 2007 +0200

sched: use list_for_each_entry_safe() in __wake_up_common()

Use list_for_each_entry_safe() instead of list_for_each_safe() in
__wake_up_common().

Signed-off-by: Matthias Kaehlcke
Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit d22830a32b9efffc8685882493c0bd203659fde8
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: resched task in task_new_fair() to get full child-runs-first semantics

make sure the parent is rescheduled.

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith

commit 0f54e55810fd0796cb49585ea6e17e3ca62dd9b1
Author: Ingo Molnar
Date: Wed Sep 26 15:37:12 2007 +0200

sched: fix new-task method

always call into ->task_new().

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
Signed-off-by: Mike Galbraith
Signed-off-by: Andrew Morton
---

 arch/i386/Kconfig       |   11 
 fs/proc/base.c          |    2 
 include/linux/sched.h   |   57 +-
 init/Kconfig            |   21 +
 kernel/delayacct.c      |    2 
 kernel/sched.c          |  577 +++++++++++++++++++----------
 kernel/sched_debug.c    |  246 ++++++++----
 kernel/sched_fair.c     |  735 ++++++++++++++------------------------
 kernel/sched_idletask.c |    5 
 kernel/sched_rt.c       |   12 
 kernel/sched_stats.h    |   28 -
 kernel/sysctl.c         |   31 -
 kernel/user.c           |   43 ++
 13 files changed, 964 insertions(+), 806 deletions(-)

diff -puN arch/i386/Kconfig~git-sched arch/i386/Kconfig
--- a/arch/i386/Kconfig~git-sched
+++ a/arch/i386/Kconfig
@@ -214,6 +214,17 @@ config X86_ES7000
 endchoice
+config SCHED_NO_NO_OMIT_FRAME_POINTER
+	bool "Single-depth WCHAN output"
+	default y
+	help
+	  Calculate simpler /proc/<pid>/wchan values. If this option
+	  is disabled then wchan values will recurse back to the
+	  caller function. This provides more accurate wchan values,
+	  at the expense of slightly more scheduling overhead.
+
+	  If in doubt, say "Y".
+ config PARAVIRT bool "Paravirtualization support (EXPERIMENTAL)" depends on EXPERIMENTAL diff -puN fs/proc/base.c~git-sched fs/proc/base.c --- a/fs/proc/base.c~git-sched +++ a/fs/proc/base.c @@ -304,7 +304,7 @@ static int proc_pid_schedstat(struct tas return sprintf(buffer, "%llu %llu %lu\n", task->sched_info.cpu_time, task->sched_info.run_delay, - task->sched_info.pcnt); + task->sched_info.pcount); } #endif diff -puN include/linux/sched.h~git-sched include/linux/sched.h --- a/include/linux/sched.h~git-sched +++ a/include/linux/sched.h @@ -135,6 +135,7 @@ extern unsigned long weighted_cpuload(co struct seq_file; struct cfs_rq; +struct task_grp; #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); @@ -595,6 +596,10 @@ struct user_struct { /* Hash table maintenance information */ struct hlist_node uidhash_node; uid_t uid; + +#ifdef CONFIG_FAIR_USER_SCHED + struct task_grp *tg; +#endif }; extern struct user_struct *find_user(uid_t); @@ -608,13 +613,17 @@ struct reclaim_state; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info { /* cumulative counters */ - unsigned long pcnt; /* # of times run on this cpu */ + unsigned long pcount; /* # of times run on this cpu */ unsigned long long cpu_time, /* time spent on the cpu */ run_delay; /* time spent waiting on a runqueue */ /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ +#ifdef CONFIG_SCHEDSTATS + /* BKL stats */ + unsigned long bkl_count; +#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -749,7 +758,7 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; + unsigned long lb_count[CPU_MAX_IDLE_TYPES]; unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; @@ -759,17 +768,17 @@ struct sched_domain { unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ - unsigned long alb_cnt; + unsigned long alb_count; unsigned long alb_failed; unsigned long alb_pushed; /* SD_BALANCE_EXEC stats */ - unsigned long sbe_cnt; + unsigned long sbe_count; unsigned long sbe_balanced; unsigned long sbe_pushed; /* SD_BALANCE_FORK stats */ - unsigned long sbf_cnt; + unsigned long sbf_count; unsigned long sbf_balanced; unsigned long sbf_pushed; @@ -857,7 +866,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); - void (*yield_task) (struct rq *rq, struct task_struct *p); + void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); @@ -887,31 +896,21 @@ struct load_weight { * 4 se->block_start * 4 se->run_node * 4 se->sleep_start - * 4 se->sleep_start_fair * 6 se->load.weight - * 7 se->delta_fair - * 15 se->wait_runtime */ struct sched_entity { - long wait_runtime; - unsigned long delta_fair_run; - unsigned long delta_fair_sleep; - unsigned long delta_exec; - s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; u64 exec_start; u64 sum_exec_runtime; + u64 vruntime; u64 prev_sum_exec_runtime; - u64 wait_start_fair; - u64 sleep_start_fair; #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; - s64 sum_wait_runtime; u64 sleep_start; u64 
sleep_max; @@ -920,9 +919,7 @@ struct sched_entity { u64 block_start; u64 block_max; u64 exec_max; - - unsigned long wait_runtime_overruns; - unsigned long wait_runtime_underruns; + u64 slice_max; #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1400,15 +1397,16 @@ static inline void idle_task_exit(void) extern void sched_idle_next(void); +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_nr_latency; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; -extern unsigned int sysctl_sched_stat_granularity; -extern unsigned int sysctl_sched_runtime_limit; -extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; +#endif + +extern unsigned int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); @@ -1842,6 +1840,17 @@ extern int sched_mc_power_savings, sched extern void normalize_rt_tasks(void); +#ifdef CONFIG_FAIR_GROUP_SCHED + +extern struct task_grp init_task_grp; + +extern struct task_grp *sched_create_group(void); +extern void sched_destroy_group(struct task_grp *tg); +extern void sched_move_task(struct task_struct *tsk); +extern int sched_group_set_shares(struct task_grp *tg, unsigned long shares); + +#endif + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff -puN init/Kconfig~git-sched init/Kconfig --- a/init/Kconfig~git-sched +++ a/init/Kconfig @@ -285,6 +285,27 @@ config CPUSETS Say N if unsure. +config FAIR_GROUP_SCHED + bool "Fair group CPU scheduler" + default y + depends on EXPERIMENTAL + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. + +choice + depends on FAIR_GROUP_SCHED + prompt "Basis for grouping tasks" + default FAIR_USER_SCHED + +config FAIR_USER_SCHED + bool "user id" + help + This option will choose userid as the basis for grouping + tasks, thus providing equal CPU bandwidth to each user. 
+ +endchoice + config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y diff -puN kernel/delayacct.c~git-sched kernel/delayacct.c --- a/kernel/delayacct.c~git-sched +++ a/kernel/delayacct.c @@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats * No locking available for sched_info (and too expensive to add one) * Mitigate by taking snapshot of values */ - t1 = tsk->sched_info.pcnt; + t1 = tsk->sched_info.pcount; t2 = tsk->sched_info.run_delay; t3 = tsk->sched_info.cpu_time; diff -puN kernel/sched.c~git-sched kernel/sched.c --- a/kernel/sched.c~git-sched +++ a/kernel/sched.c @@ -170,31 +170,89 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -struct load_stat { - struct load_weight load; - u64 load_update_start, load_update_last; - unsigned long delta_fair, delta_exec, delta_stat; +#ifdef CONFIG_FAIR_GROUP_SCHED + +struct cfs_rq; + +/* task group related information */ +struct task_grp { + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; }; +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; + +static struct sched_entity *init_sched_entity_p[NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, + }; + +#ifdef CONFIG_FAIR_USER_SCHED +#define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +#else +#define INIT_TASK_GRP_LOAD NICE_0_LOAD +#endif + +static int init_task_grp_load = INIT_TASK_GRP_LOAD; + +/* return group to which a task belongs */ +static inline struct task_grp *task_grp(struct task_struct *p) +{ + struct task_grp *tg; + +#ifdef CONFIG_FAIR_USER_SCHED + tg = p->user->tg; +#else + tg = &init_task_grp; +#endif + + return tg; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_cfs_rq(struct task_struct *p) +{ + p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)]; + p->se.parent = task_grp(p)->se[task_cpu(p)]; +} + +#else + +static inline void set_task_cfs_rq(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; unsigned long nr_running; - s64 fair_clock; u64 exec_clock; - s64 wait_runtime; - u64 sleeper_bonus; - unsigned long wait_runtime_overruns, wait_runtime_underruns; + u64 min_vruntime; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; struct rb_node *rb_load_balance_curr; -#ifdef CONFIG_FAIR_GROUP_SCHED /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; + + unsigned long nr_spread_over; + +#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in @@ -205,6 +263,8 @@ struct cfs_rq { * list is used during load balance. */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? 
*/ + struct task_grp *tg; /* group that "owns" this runqueue */ + struct rcu_head rcu; #endif }; @@ -236,7 +296,7 @@ struct rq { #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct load_stat ls; /* capture load from *all* tasks on this cpu */ + struct load_weight load; /* capture load from *all* tasks on this cpu */ unsigned long nr_load_updates; u64 nr_switches; @@ -288,16 +348,19 @@ struct rq { unsigned long yld_exp_empty; unsigned long yld_act_empty; unsigned long yld_both_empty; - unsigned long yld_cnt; + unsigned long yld_count; /* schedule() stats */ unsigned long sched_switch; - unsigned long sched_cnt; + unsigned long sched_count; unsigned long sched_goidle; /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; + unsigned long ttwu_count; unsigned long ttwu_local; + + /* BKL stats */ + unsigned long bkl_count; #endif struct lock_class_key rq_lock_key; }; @@ -382,6 +445,33 @@ static void update_rq_clock(struct rq *r #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* + * Debugging: various feature bits + */ +enum { + SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, + SCHED_FEAT_START_DEBIT = 2, + SCHED_FEAT_USE_TREE_AVG = 4, + SCHED_FEAT_APPROX_AVG = 8, +}; + +const_debug unsigned int sysctl_sched_features = + SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | + SCHED_FEAT_START_DEBIT *1 | + SCHED_FEAT_USE_TREE_AVG *0 | + SCHED_FEAT_APPROX_AVG *0; + +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + +/* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ @@ -400,18 +490,6 @@ unsigned long long cpu_clock(int cpu) return now; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Change a task's ->cfs_rq if it moves across CPUs */ -static inline void set_task_cfs_rq(struct task_struct *p) -{ - p->se.cfs_rq = &task_rq(p)->cfs; -} -#else -static inline void set_task_cfs_rq(struct task_struct *p) -{ -} -#endif - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -644,19 +722,6 @@ static inline void resched_task(struct t } #endif -static u64 div64_likely32(u64 divident, unsigned long divisor) -{ -#if BITS_PER_LONG == 32 - if (likely(divident <= 0xffffffffULL)) - return (u32)divident / divisor; - do_div(divident, divisor); - - return divident; -#else - return divident / divisor; -#endif -} - #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) #else @@ -698,16 +763,14 @@ calc_delta_fair(unsigned long delta_exec return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); } -static void update_load_add(struct load_weight *lw, unsigned long inc) +static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - lw->inv_weight = 0; } -static void update_load_sub(struct load_weight *lw, unsigned long dec) +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - lw->inv_weight = 0; } /* @@ -792,20 +855,11 @@ static int balance_tasks(struct rq *this #define sched_class_highest (&rt_sched_class) -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - /* * Update delta_exec, delta_fair fields for rq. 
* * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while + * total load (rq->load.weight) on the runqueue, while * delta_exec advances at the same rate as wall-clock (provided * cpu is not idle). * @@ -813,35 +867,17 @@ static void __update_curr_load(struct rq * runqueue over any given interval. This (smoothened) load is used * during load balance. * - * This function is called /before/ updating rq->ls.load + * This function is called /before/ updating rq->load * and when switching tasks. */ -static void update_curr_load(struct rq *rq) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = rq->clock; - ls->delta_stat += rq->clock - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. - */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); -} - static inline void inc_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); - update_load_add(&rq->ls.load, p->se.load.weight); + update_load_add(&rq->load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); - update_load_sub(&rq->ls.load, p->se.load.weight); + update_load_sub(&rq->load, p->se.load.weight); } static void inc_nr_running(struct task_struct *p, struct rq *rq) @@ -858,8 +894,6 @@ static void dec_nr_running(struct task_s static void set_load_weight(struct task_struct *p) { - p->se.wait_runtime = 0; - if (task_has_rt_policy(p)) { p->se.load.weight = prio_to_weight[0] * 2; p->se.load.inv_weight = prio_to_wmult[0] >> 1; @@ -988,15 +1022,15 @@ inline int task_curr(const struct task_s /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->ls.load.weight; + return cpu_rq(cpu)->load.weight; } static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { #ifdef CONFIG_SMP task_thread_info(p)->cpu = cpu; - set_task_cfs_rq(p); #endif + set_task_cfs_rq(p); } #ifdef CONFIG_SMP @@ -1005,15 +1039,11 @@ void set_task_cpu(struct task_struct *p, { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - u64 clock_offset, fair_clock_offset; + struct cfs_rq *old_cfsrq = task_cfs_rq(p), + *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); + u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; - - if (p->se.wait_start_fair) - p->se.wait_start_fair -= fair_clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1023,6 +1053,8 @@ void set_task_cpu(struct task_struct *p, if (p->se.block_start) p->se.block_start -= clock_offset; #endif + p->se.vruntime -= old_cfsrq->min_vruntime - + new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -1451,7 +1483,7 @@ static int try_to_wake_up(struct task_st new_cpu = cpu; - schedstat_inc(rq, ttwu_cnt); + schedstat_inc(rq, ttwu_count); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); goto out_set_cpu; @@ -1583,28 +1615,20 @@ int fastcall wake_up_state(struct task_s */ static void __sched_fork(struct task_struct *p) { - p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.delta_exec = 0; - p->se.delta_fair_run = 0; - p->se.delta_fair_sleep = 0; - p->se.wait_runtime = 0; - 
p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; p->se.block_start = 0; p->se.sleep_max = 0; p->se.block_max = 0; p->se.exec_max = 0; + p->se.slice_max = 0; p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; #endif INIT_LIST_HEAD(&p->run_list); @@ -1635,12 +1659,14 @@ void sched_fork(struct task_struct *p, i #ifdef CONFIG_SMP cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif - __set_task_cpu(p, cpu); + set_task_cpu(p, cpu); /* * Make sure we do not leak PI boosting priority to the child: */ p->prio = current->normal_prio; + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -1657,12 +1683,6 @@ void sched_fork(struct task_struct *p, i } /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -unsigned int __read_mostly sysctl_sched_child_runs_first = 1; - -/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping @@ -1687,15 +1707,8 @@ void fastcall wake_up_new_task(struct ta else p->sched_class = &fair_sched_class; - if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; - - if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || - (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || - !current->se.on_rq) { - + if (task_cpu(p) != this_cpu || !p->sched_class->task_new || + !current->se.on_rq) { activate_task(rq, p, 0); } else { /* @@ -1986,42 +1999,10 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; - unsigned long total_load = this_rq->ls.load.weight; - unsigned long this_load = total_load; - struct load_stat *ls = &this_rq->ls; + unsigned long this_load = this_rq->load.weight; int i, scale; this_rq->nr_load_updates++; - if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) - goto do_avg; - - /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq); - - fair_delta64 = ls->delta_fair + 1; - ls->delta_fair = 0; - - exec_delta64 = ls->delta_exec + 1; - ls->delta_exec = 0; - - sample_interval64 = this_rq->clock - ls->load_update_last; - ls->load_update_last = this_rq->clock; - - if ((s64)sample_interval64 < (s64)TICK_NSEC) - sample_interval64 = TICK_NSEC; - - if (exec_delta64 > sample_interval64) - exec_delta64 = sample_interval64; - - idle_delta64 = sample_interval64 - exec_delta64; - - tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); - tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); - - this_load = (unsigned long)tmp64; - -do_avg: /* Update our load: */ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2031,7 +2012,13 @@ do_avg: old_load = this_rq->cpu_load[i]; new_load = this_load; - + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (new_load > old_load) + new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } } @@ -2657,7 +2644,7 @@ static int load_balance(int this_cpu, st !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[idle]); + schedstat_inc(sd, lb_count[idle]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, @@ -2810,7 +2797,7 @@ load_balance_newidle(int this_cpu, struc !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); + schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, &cpus, NULL); @@ -2944,7 +2931,7 @@ static void active_load_balance(struct r } if (likely(sd)) { - schedstat_inc(sd, alb_cnt); + schedstat_inc(sd, alb_count); if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE)) @@ -3434,7 +3421,13 @@ static inline void schedule_debug(struct profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_cnt); + schedstat_inc(this_rq(), sched_count); +#ifdef CONFIG_SCHEDSTATS + if (unlikely(prev->lock_depth >= 0)) { + schedstat_inc(this_rq(), bkl_count); + schedstat_inc(prev, sched_info.bkl_count); + } +#endif } /* @@ -3640,10 +3633,9 @@ EXPORT_SYMBOL(default_wake_function); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { - struct list_head *tmp, *next; + wait_queue_t *curr, *next; - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, sync, key) && @@ -3951,7 +3943,7 @@ EXPORT_SYMBOL(sleep_on_timeout); void rt_mutex_setprio(struct task_struct *p, int prio) { unsigned long flags; - int oldprio, on_rq; + int oldprio, on_rq, running; struct rq *rq; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3961,8 +3953,12 @@ void rt_mutex_setprio(struct task_struct oldprio = p->prio; on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3972,13 +3968,15 @@ void rt_mutex_setprio(struct task_struct p->prio = prio; if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4184,7 +4182,7 @@ __setscheduler(struct rq *rq, struct tas int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1, on_rq; + int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; struct rq *rq; @@ -4266,18 +4264,26 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - if (on_rq) + running = task_running(rq, p); + if (on_rq) { deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + } + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (on_rq) { + if (running) + p->sched_class->set_curr_task(rq); activate_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we 
are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_running(rq, p)) { + if (running) { if (p->prio > oldprio) resched_task(rq->curr); } else { @@ -4559,8 +4565,8 @@ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_cnt); - current->sched_class->yield_task(rq, current); + schedstat_inc(rq, yld_count); + current->sched_class->yield_task(rq); /* * Since we are going to call schedule() anyway, there's @@ -4904,32 +4910,6 @@ void __cpuinit init_idle(struct task_str */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 100000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_runtime_limit = sysctl_sched_latency; - sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; -} - #ifdef CONFIG_SMP /* * This is how migration works: @@ -6497,12 +6477,10 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); - sched_init_granularity(); } #else void __init sched_init_smp(void) { - sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -6519,15 +6497,14 @@ int in_sched_functions(unsigned long add static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; - cfs_rq->fair_clock = 1; #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } void __init sched_init(void) { - u64 now = sched_clock(); int highest_cpu = 0; int i, j; @@ -6550,10 +6527,27 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = &init_task_grp; + list_add(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = init_task_grp_load; + se->load.inv_weight = + div64_64(1ULL<<32, init_task_grp_load); + se->parent = NULL; + } + init_task_grp.shares = init_task_grp_load; #endif - rq->ls.load_update_last = now; - rq->ls.load_update_start = now; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -6647,17 +6641,12 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { - p->se.fair_key = 0; - p->se.wait_runtime = 0; p->se.exec_start = 0; - p->se.wait_start_fair = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; if (!rt_task(p)) { @@ -6744,3 +6733,197 @@ void set_curr_task(int cpu, struct task_ } #endif + +#ifdef 
CONFIG_FAIR_GROUP_SCHED + +/* allocate runqueue etc for a new task group */ +struct task_grp *sched_create_group(void) +{ + struct task_grp *tg; + struct cfs_rq *cfs_rq; + struct sched_entity *se; + struct rq *rq; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + if (!tg->se) + goto err; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, + cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, + cpu_to_node(i)); + if (!se) + goto err; + + memset(cfs_rq, 0, sizeof(struct cfs_rq)); + memset(se, 0, sizeof(struct sched_entity)); + + tg->cfs_rq[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + + tg->se[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } + + tg->shares = NICE_0_LOAD; + + return tg; + +err: + for_each_possible_cpu(i) { + if (tg->cfs_rq && tg->cfs_rq[i]) + kfree(tg->cfs_rq[i]); + if (tg->se && tg->se[i]) + kfree(tg->se[i]); + } + if (tg->cfs_rq) + kfree(tg->cfs_rq); + if (tg->se) + kfree(tg->se); + if (tg) + kfree(tg); + + return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group(struct rcu_head *rhp) +{ + struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); + struct task_grp *tg = cfs_rq->tg; + struct sched_entity *se; + int i; + + /* now it should be safe to free those cfs_rqs */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + kfree(cfs_rq); + + se = tg->se[i]; + kfree(se); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_grp *tg) +{ + struct cfs_rq *cfs_rq; + int i; + + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + cfs_rq = tg->cfs_rq[0]; + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&cfs_rq->rcu, free_sched_group); +} + +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. 
+ */ +void sched_move_task(struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + if (tsk->sched_class != &fair_sched_class) + goto done; + + update_rq_clock(rq); + + running = task_running(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) { + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + } + + set_task_cfs_rq(tsk); + + if (on_rq) { + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + enqueue_task(rq, tsk, 0); + } + +done: + task_rq_unlock(rq, &flags); +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + int on_rq; + + spin_lock_irq(&rq->lock); + + on_rq = se->on_rq; + if (on_rq) + dequeue_entity(cfs_rq, se, 0); + + se->load.weight = shares; + se->load.inv_weight = div64_64((1ULL<<32), shares); + + if (on_rq) + enqueue_entity(cfs_rq, se, 0); + + spin_unlock_irq(&rq->lock); +} + +int sched_group_set_shares(struct task_grp *tg, unsigned long shares) +{ + int i; + + if (tg->shares == shares) + return 0; + + /* return -EINVAL if the new value is not sane */ + + tg->shares = shares; + for_each_possible_cpu(i) + set_se_shares(tg->se[i], shares); + + return 0; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff -puN kernel/sched_debug.c~git-sched kernel/sched_debug.c --- a/kernel/sched_debug.c~git-sched +++ a/kernel/sched_debug.c @@ -28,6 +28,31 @@ printk(x); \ } while (0) +/* + * Ease the printing of nsec fields: + */ +static long long nsec_high(long long nsec) +{ + if (nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static unsigned long nsec_low(long long nsec) +{ + if (nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, p->pid, - (long long)p->se.fair_key, - (long long)(p->se.fair_key - rq->cfs.fair_clock), - (long long)p->se.wait_runtime, + SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_wait_runtime, - (long long)p->se.sum_sleep_runtime, - (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - 0LL, 0LL, 0LL, 0LL, 0LL); + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif } @@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key delta waiting" - " switches prio" - " sum-exec sum-wait sum-sleep" - " wait-overrun wait-underrun\n" - "------------------------------------------------------------------" - "----------------" - "------------------------------------------------" - "--------------------------------\n"); + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" + 
"------------------------------------------------------" + "----------------------------------------------------\n"); read_lock_irq(&tasklist_lock); @@ -83,45 +100,48 @@ static void print_rq(struct seq_file *m, read_unlock_irq(&tasklist_lock); } -static void -print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 wait_runtime_rq_sum = 0; - struct task_struct *p; - struct rb_node *curr; - unsigned long flags; + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; struct rq *rq = &per_cpu(runqueues, cpu); + struct sched_entity *last; + unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); - curr = first_fair(cfs_rq); - while (curr) { - p = rb_entry(curr, struct task_struct, se.run_node); - wait_runtime_rq_sum += p->se.wait_runtime; - - curr = rb_next(curr); - } - spin_unlock_irqrestore(&rq->lock, flags); - - SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", - (long long)wait_runtime_rq_sum); -} - -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ SEQ_printf(m, "\ncfs_rq\n"); -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - - P(fair_clock); - P(exec_clock); - P(wait_runtime); - P(wait_runtime_overruns); - P(wait_runtime_underruns); - P(sleeper_bonus); -#undef P + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); - print_cfs_rq_runtime_sum(m, cpu, cfs_rq); + spin_lock_irqsave(&rq->lock, flags); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + min_vruntime = rq->cfs.min_vruntime; + rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", + rq->bkl_count); +#endif + SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", + cfs_rq->nr_spread_over); } static void print_cpu(struct seq_file *m, int cpu) @@ -141,31 +161,32 @@ static void print_cpu(struct seq_file *m #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->ls.load.weight); - P(ls.delta_fair); - P(ls.delta_exec); + rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - P(next_balance); + PN(next_balance); P(curr->pid); - P(clock); - P(idle_clock); - P(prev_clock_raw); + PN(clock); + PN(idle_clock); + PN(prev_clock_raw); P(clock_warps); P(clock_overflows); P(clock_deep_idle_events); - P(clock_max_delta); + PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); #undef P +#undef PN print_cfs_stats(m, 
cpu); @@ -182,7 +203,20 @@ static int sched_debug_show(struct seq_f (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); + SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); + +#define P(x) \ + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_nr_latency); + PN(sysctl_sched_wakeup_granularity); + PN(sysctl_sched_batch_wakeup_granularity); + PN(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -197,6 +231,45 @@ static void sysrq_sched_debug_show(void) sched_debug_show(NULL, NULL); } +#ifdef CONFIG_FAIR_USER_SCHED + +static DEFINE_MUTEX(root_user_share_mutex); + +static int +root_user_share_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + return sprintf(page, "%d\n", init_task_grp_load); +} + +static int +root_user_share_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned long shares; + char kbuf[sizeof(unsigned long)+1]; + int rc = 0; + + if (copy_from_user(kbuf, buffer, sizeof(kbuf))) + return -EFAULT; + + shares = simple_strtoul(kbuf, NULL, 0); + + if (!shares) + shares = NICE_0_LOAD; + + mutex_lock(&root_user_share_mutex); + + init_task_grp_load = shares; + rc = sched_group_set_shares(&init_task_grp, shares); + + mutex_unlock(&root_user_share_mutex); + + return (rc < 0 ? rc : count); +} + +#endif /* CONFIG_FAIR_USER_SCHED */ + static int sched_debug_open(struct inode *inode, struct file *filp) { return single_open(filp, sched_debug_show, NULL); @@ -219,6 +292,15 @@ static int __init init_sched_debug_procf pe->proc_fops = &sched_debug_fops; +#ifdef CONFIG_FAIR_USER_SCHED + pe = create_proc_entry("root_user_cpu_share", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->read_proc = root_user_share_read_proc; + pe->write_proc = root_user_share_write_proc; +#endif + return 0; } @@ -240,24 +322,23 @@ void proc_sched_show_task(struct task_st SEQ_printf(m, "----------------------------------------------\n"); #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) +#define PN(F) \ + SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - P(se.wait_runtime); - P(se.wait_start_fair); - P(se.exec_start); - P(se.sleep_start_fair); - P(se.sum_exec_runtime); + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - P(se.wait_start); - P(se.sleep_start); - P(se.block_start); - P(se.sleep_max); - P(se.block_max); - P(se.exec_max); - P(se.wait_max); - P(se.wait_runtime_overruns); - P(se.wait_runtime_underruns); - P(se.sum_wait_runtime); + PN(se.wait_start); + PN(se.sleep_start); + PN(se.block_start); + PN(se.sleep_max); + PN(se.block_max); + PN(se.exec_max); + PN(se.slice_max); + PN(se.wait_max); + P(sched_info.bkl_count); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -265,6 +346,7 @@ void proc_sched_show_task(struct task_st P(policy); P(prio); #undef P +#undef PN { u64 t0, t1; @@ -279,9 +361,13 @@ void proc_sched_show_task(struct task_st void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; - p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + 
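The SPLIT_NS()/nsec_high()/nsec_low() helpers used throughout the sched_debug.c hunks above print a nanosecond quantity as whole milliseconds plus a six-digit nanosecond remainder ("%Ld.%06ld"). A stand-alone sketch of the same split, using plain division instead of the kernel's do_div():

#include <stdio.h>

/*
 * Split a (possibly negative) nanosecond value into a millisecond part and
 * a 0..999999 remainder, matching the "%Ld.%06ld" format above.
 */
static void split_ns(long long nsec, long long *high, unsigned long *low)
{
	long long a = nsec < 0 ? -nsec : nsec;

	*high = a / 1000000;		/* whole milliseconds */
	*low  = a % 1000000;		/* nanosecond remainder */
	if (nsec < 0)
		*high = -*high;
}

int main(void)
{
	long long high;
	unsigned long low;

	split_ns(20000000LL, &high, &low);	/* e.g. the 20ms latency default */
	printf("%lld.%06lu\n", high, low);	/* prints 20.000000 */
	return 0;
}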
p->se.slice_max = 0; + p->se.wait_max = 0; + p->sched_info.bkl_count = 0; #endif - p->se.sum_exec_runtime = 0; + p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; } diff -puN kernel/sched_fair.c~git-sched kernel/sched_fair.c --- a/kernel/sched_fair.c~git-sched +++ a/kernel/sched_fair.c @@ -34,13 +34,19 @@ * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; +const_debug unsigned int sysctl_sched_latency = 20000000ULL; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +const_debug unsigned int sysctl_sched_nr_latency = 20; /* * sys_sched_yield() compat mode @@ -66,7 +72,7 @@ unsigned int __read_mostly sysctl_sched_ * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; +const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -76,34 +82,7 @@ unsigned int sysctl_sched_batch_wakeup_g * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; - -unsigned int sysctl_sched_stat_granularity __read_mostly; - -/* - * Initialized in sched_init_granularity() [to 5 times the base granularity]: - */ -unsigned int sysctl_sched_runtime_limit __read_mostly; - -/* - * Debugging: various feature bits - */ -enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_SLEEPER_AVG = 2, - SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_PRECISE_CPU_LOAD = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, -}; - -unsigned int sysctl_sched_features __read_mostly = - SCHED_FEAT_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *0 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_PRECISE_CPU_LOAD *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; +const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; extern struct sched_class fair_sched_class; @@ -119,21 +98,9 @@ static inline struct rq *rq_of(struct cf return cfs_rq->rq; } -/* currently running entity (if any) on this cfs_rq */ -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - return cfs_rq->curr; -} - /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->curr = se; -} - #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -141,21 +108,8 @@ static inline struct rq *rq_of(struct cf return container_of(cfs_rq, struct rq, cfs); } -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - - if (unlikely(rq->curr->sched_class != &fair_sched_class)) - return NULL; - - return &rq->curr->se; -} - #define entity_is_task(se) 1 -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline struct task_struct *task_of(struct sched_entity *se) @@ -168,16 +122,42 @@ static inline 
struct task_struct *task_o * Scheduling class tree data structure manipulation methods: */ +static inline u64 +max_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline u64 +min_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline s64 +entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->vruntime - cfs_rq->min_vruntime; +} + /* * Enqueue an entity into the rb-tree: */ -static inline void +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = se->fair_key; + s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -190,7 +170,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, * We dont care about collisions. Nodes with * the same key stay together. */ - if (key - entry->fair_key < 0) { + if (key < entity_key(cfs_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; @@ -207,24 +187,15 @@ __enqueue_entity(struct cfs_rq *cfs_rq, rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static inline void +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -237,118 +208,59 @@ static struct sched_entity *__pick_next_ return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); } -/************************************************************** - * Scheduling class statistics methods: - */ - -/* - * Calculate the preemption granularity needed to schedule every - * runnable task once per sysctl_sched_latency amount of time. - * (down to a sensible low limit on granularity) - * - * For example, if there are 2 tasks running and latency is 10 msecs, - * we switch tasks every 5 msecs. If we have 3 tasks running, we have - * to switch tasks every 3.33 msecs to get a 10 msecs observed latency - * for each task. We do finer and finer scheduling up to until we - * reach the minimum granularity value. - * - * To achieve this we use the following dynamic-granularity rule: - * - * gran = lat/nr - lat/nr/nr - * - * This comes out of the following equations: - * - * kA1 + gran = kB1 - * kB2 + gran = kA2 - * kA2 = kA1 - * kB2 = kB1 - d + d/nr - * lat = d * nr - * - * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), - * '1' is start of time, '2' is end of time, 'd' is delay between - * 1 and 2 (during which task B was running), 'nr' is number of tasks - * running, 'lat' is the the period of each task. ('lat' is the - * sched_latency that we aim for.) 
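max_vruntime()/min_vruntime() above compare two u64 clocks through a signed difference, so the result stays correct even when vruntime wraps around 2^64; the init_cfs_rq() change earlier in this patch deliberately starts min_vruntime just below the wrap point ((u64)(-(1LL << 20))) so that wrap handling is exercised early. A stand-alone sketch of that wrap-safe comparison, with illustrative names:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

/*
 * Wrap-safe "later of two u64 clocks": the signed difference is meaningful
 * as long as the two values are within 2^63 of each other.
 */
static uint64_t later_vruntime(uint64_t a, uint64_t b)
{
	return (int64_t)(b - a) > 0 ? b : a;
}

int main(void)
{
	uint64_t near_wrap = (uint64_t)-(1LL << 20);	/* as in init_cfs_rq() */
	uint64_t wrapped   = near_wrap + (2 << 20);	/* advanced past 2^64 */

	/* A plain '>' would pick near_wrap; the signed delta picks wrapped. */
	printf("%" PRIu64 "\n", later_vruntime(near_wrap, wrapped));
	return 0;
}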
- */ -static long -sched_granularity(struct cfs_rq *cfs_rq) +static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - unsigned int gran = sysctl_sched_latency; - unsigned int nr = cfs_rq->nr_running; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *se = NULL; + struct rb_node *parent; - if (nr > 1) { - gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_min_granularity); + while (*link) { + parent = *link; + se = rb_entry(parent, struct sched_entity, run_node); + link = &parent->rb_right; } - return gran; + return se; } -/* - * We rescale the rescheduling granularity of tasks according to their - * nice level, but only linearly, not exponentially: +/************************************************************** + * Scheduling class statistics methods: */ -static long -niced_granularity(struct sched_entity *curr, unsigned long granularity) + +static u64 __sched_period(unsigned long nr_running) { - u64 tmp; + u64 period = sysctl_sched_latency; + unsigned long nr_latency = sysctl_sched_nr_latency; - if (likely(curr->load.weight == NICE_0_LOAD)) - return granularity; - /* - * Positive nice levels get the same granularity as nice-0: - */ - if (likely(curr->load.weight < NICE_0_LOAD)) { - tmp = curr->load.weight * (u64)granularity; - return (long) (tmp >> NICE_0_SHIFT); + if (unlikely(nr_running > nr_latency)) { + period *= nr_running; + do_div(period, nr_latency); } - /* - * Negative nice level tasks get linearly finer - * granularity: - */ - tmp = curr->load.inv_weight * (u64)granularity; - /* - * It will always fit into 'long': - */ - return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); + return period; } -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - long limit = sysctl_sched_runtime_limit; + u64 period = __sched_period(cfs_rq->nr_running); - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if (unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } -} + period *= se->load.weight; + do_div(period, cfs_rq->load.weight); -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); + return period; } -static void -add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) +static u64 __sched_vslice(unsigned long nr_running) { - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + unsigned long period = sysctl_sched_latency; + unsigned long nr_latency = sysctl_sched_nr_latency; + + if (unlikely(nr_running > nr_latency)) + nr_running = nr_latency; + + period /= nr_running; + + return (u64)period; } /* @@ -356,46 +268,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, * are not in our scheduling class. 
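__sched_period() and sched_slice() above replace the removed dynamic-granularity code: the scheduling period stays at sysctl_sched_latency while up to sysctl_sched_nr_latency tasks are runnable, stretches linearly beyond that, and each entity's slice is the period scaled by its share of the queue weight. A quick user-space sketch of the arithmetic, using the defaults from this patch (20ms latency, nr_latency 20) as illustrative constants:

#include <stdio.h>

#define LATENCY_NS	20000000ULL	/* sysctl_sched_latency default */
#define NR_LATENCY	20		/* sysctl_sched_nr_latency default */

static unsigned long long sched_period(unsigned long nr_running)
{
	unsigned long long period = LATENCY_NS;

	if (nr_running > NR_LATENCY) {
		period *= nr_running;
		period /= NR_LATENCY;	/* stretch the period */
	}
	return period;
}

/* slice = period * weight / total queue weight */
static unsigned long long slice(unsigned long nr_running,
				unsigned long weight, unsigned long total_weight)
{
	return sched_period(nr_running) * weight / total_weight;
}

int main(void)
{
	/* 3 nice-0 tasks: each gets ~6.67ms of a 20ms period */
	printf("%llu\n", slice(3, 1024, 3 * 1024));
	/* 40 nice-0 tasks: the period stretches to 40ms, the slice stays 1ms */
	printf("%llu\n", slice(40, 1024, 40 * 1024));
	return 0;
}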
*/ static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) { - unsigned long delta, delta_exec, delta_fair, delta_mine; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; + unsigned long delta_exec_weighted; + u64 vruntime; - delta_exec = curr->delta_exec; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; - - if (unlikely(!load)) - return; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { - delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); - delta = min(delta, (unsigned long)( - (long)sysctl_sched_runtime_limit - curr->wait_runtime)); - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; + schedstat_add(cfs_rq, exec_clock, delta_exec); + delta_exec_weighted = delta_exec; + if (unlikely(curr->load.weight != NICE_0_LOAD)) { + delta_exec_weighted = calc_delta_fair(delta_exec_weighted, + &curr->load); } + curr->vruntime += delta_exec_weighted; - cfs_rq->fair_clock += delta_fair; /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(curr->vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = curr->vruntime; + + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) { - struct sched_entity *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; if (unlikely(!curr)) @@ -406,123 +313,47 @@ static void update_curr(struct cfs_rq *c * since the last time we changed load (this cannot * overflow on 32 bits): */ - delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); + delta_exec = (unsigned long)(now - curr->exec_start); - curr->delta_exec += delta_exec; - - if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr); - curr->delta_exec = 0; - } - curr->exec_start = rq_of(cfs_rq)->clock; + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; } static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->wait_start_fair = cfs_rq->fair_clock; schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } -/* - * We calculate fair deltas here, so protect against the random effects - * of a multiplication overflow by capping it to the runtime limit: - */ -#if BITS_PER_LONG == 32 static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) +calc_weighted(unsigned long delta, struct sched_entity *se) { - u64 tmp = (u64)delta * weight >> shift; + unsigned long weight = se->load.weight; - if (unlikely(tmp > sysctl_sched_runtime_limit*2)) - return sysctl_sched_runtime_limit*2; - return tmp; + if (unlikely(weight != NICE_0_LOAD)) + return (u64)delta * se->load.weight >> NICE_0_SHIFT; + else + return delta; } -#else -static inline unsigned 
long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - return delta * weight >> shift; -} -#endif /* * Task is being enqueued - update stats: */ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 key; - /* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); - /* - * Update the key: - */ - key = cfs_rq->fair_clock; - - /* - * Optimize the common nice 0 case: - */ - if (likely(se->load.weight == NICE_0_LOAD)) { - key -= se->wait_runtime; - } else { - u64 tmp; - - if (se->wait_runtime < 0) { - tmp = -se->wait_runtime; - key += (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } else { - tmp = se->wait_runtime; - key -= (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } - } - - se->fair_key = key; -} - -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - unsigned long delta_fair = se->delta_fair_run; - - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - add_wait_runtime(cfs_rq, se, delta_fair); } static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long delta_fair; - - if (unlikely(!se->wait_start_fair)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - se->delta_fair_run += delta_fair; - if (unlikely(abs(se->delta_fair_run) >= - sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se); - se->delta_fair_run = 0; - } - - se->wait_start_fair = 0; + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); schedstat_set(se->wait_start, 0); } @@ -534,7 +365,7 @@ update_stats_dequeue(struct cfs_rq *cfs_ * Mark the end of the wait period if dequeueing a * waiting task: */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); } @@ -563,66 +394,24 @@ update_stats_curr_end(struct cfs_rq *cfs * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long load = cfs_rq->load.weight, delta_fair; - long prev_runtime; - - /* - * Do not boost sleepers if there's too much bonus 'in flight' - * already: - */ - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - return; - - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) - load = rq_of(cfs_rq)->cpu_load[2]; - - delta_fair = se->delta_fair_sleep; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep period: - */ - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} - /* - * Track the amount of bonus we've 
given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - se->delta_fair_sleep += delta_fair; - if (unlikely(abs(se->delta_fair_sleep) >= - sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se); - se->delta_fair_sleep = 0; - } - - se->sleep_start_fair = 0; - #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; @@ -651,6 +440,49 @@ static void enqueue_sleeper(struct cfs_r #endif } +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +{ + u64 vruntime; + + vruntime = cfs_rq->min_vruntime; + + if (sched_feat(USE_TREE_AVG)) { + struct sched_entity *last = __pick_last_entity(cfs_rq); + if (last) { + vruntime += last->vruntime; + vruntime >>= 1; + } + } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) + vruntime += __sched_vslice(cfs_rq->nr_running)/2; + + if (initial && sched_feat(START_DEBIT)) + vruntime += __sched_vslice(cfs_rq->nr_running + 1); + + if (!initial) { + if (sched_feat(NEW_FAIR_SLEEPERS)) + vruntime -= sysctl_sched_latency; + + vruntime = max_t(s64, vruntime, se->vruntime); + } + + se->vruntime = vruntime; + +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { @@ -659,11 +491,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, st */ update_curr(cfs_rq); - if (wakeup) + if (wakeup) { + place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); + } update_stats_enqueue(cfs_rq, se); - __enqueue_entity(cfs_rq, se); + check_spread(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); } static void @@ -671,7 +508,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, st { update_stats_dequeue(cfs_rq, se); if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -683,60 +519,53 @@ dequeue_entity(struct cfs_rq *cfs_rq, st } #endif } - __dequeue_entity(cfs_rq, se); + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); } /* * Preempt the current task with a newly woken task if needed: */ static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr, unsigned long granularity) +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - s64 __delta = curr->fair_key - se->fair_key; unsigned long ideal_runtime, delta_exec; - /* - * ideal_runtime is compared against sum_exec_runtime, which is - * walltime, hence do not scale. 
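place_entity() above decides the vruntime an entity starts with: a freshly forked task begins at min_vruntime plus one extra vslice when START_DEBIT is set, while a waking task gets at most sysctl_sched_latency of sleep credit and is never moved backwards below its old vruntime. The sketch below models only that core policy (the USE_TREE_AVG/APPROX_AVG variants are omitted, and the feature flags are plain booleans here):

#include <stdint.h>
#include <stdio.h>

#define LATENCY_NS 20000000LL

/* Simplified placement: 'initial' means a freshly forked task, else a wakeup. */
static int64_t place_vruntime(int64_t min_vruntime, int64_t se_vruntime,
			      int64_t vslice, int initial,
			      int start_debit, int fair_sleepers)
{
	int64_t vruntime = min_vruntime;

	if (initial && start_debit)
		vruntime += vslice;		/* new task pays one slice up front */

	if (!initial) {
		if (fair_sleepers)
			vruntime -= LATENCY_NS;	/* bounded sleep credit */
		if (vruntime < se_vruntime)	/* never move a task backwards */
			vruntime = se_vruntime;
	}
	return vruntime;
}

int main(void)
{
	/* A task that slept for a long time still gets only ~20ms of credit. */
	printf("%lld\n", (long long)place_vruntime(1000000000LL, 100000000LL,
						   5000000LL, 0, 1, 1));
	return 0;
}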
- */ - ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, - (unsigned long)sysctl_sched_min_granularity); - - /* - * If we executed more than what the latency constraint suggests, - * reduce the rescheduling granularity. This way the total latency - * of how much a task is not scheduled converges to - * sysctl_sched_latency: - */ + ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) - granularity = 0; - - /* - * Take scheduling granularity into account - do not - * preempt the current task unless the best task has - * a larger than sched_granularity fairness advantage: - * - * scale granularity as key space is in fair_clock. - */ - if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); } -static inline void +static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. (note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) - */ - update_stats_wait_end(cfs_rq, se); + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + } + update_stats_curr_start(cfs_rq, se); - set_cfs_rq_curr(cfs_rq, se); + cfs_rq->curr = se; +#ifdef CONFIG_SCHEDSTATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + se->slice_max = max(se->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -760,31 +589,24 @@ static void put_prev_entity(struct cfs_r update_stats_curr_end(cfs_rq, prev); - if (prev->on_rq) + check_spread(cfs_rq, prev); + if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); - set_cfs_rq_curr(cfs_rq, NULL); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } + cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *next; - - /* - * Dequeue and enqueue the task to update its - * position within the tree: - */ - dequeue_entity(cfs_rq, curr, 0); - enqueue_entity(cfs_rq, curr, 0); - /* - * Reschedule if another task tops the current one. + * Update run-time statistics of the 'current'. 
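check_preempt_tick() above reduces tick preemption to a single test: the wall-clock runtime since the task was picked (sum_exec_runtime - prev_sum_exec_runtime) is compared against its sched_slice(), and the task is rescheduled once it has overrun that slice. A tiny stand-alone sketch of the decision, with illustrative parameter names:

#include <stdio.h>

/* Preempt 'current' once it has run longer than its fair slice. */
static int should_resched(unsigned long long sum_exec_runtime,
			  unsigned long long prev_sum_exec_runtime,
			  unsigned long long ideal_runtime)
{
	unsigned long long delta_exec = sum_exec_runtime - prev_sum_exec_runtime;

	return delta_exec > ideal_runtime;
}

int main(void)
{
	/* 7ms of runtime against a ~6.67ms slice -> preempt */
	printf("%d\n", should_resched(107000000ULL, 100000000ULL, 6666666ULL));
	return 0;
}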
*/ - next = __pick_next_entity(cfs_rq); - if (next == curr) - return; + update_curr(cfs_rq); - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + if (cfs_rq->nr_running > 1) + check_preempt_tick(cfs_rq, curr); } /************************************************** @@ -819,23 +641,28 @@ static inline struct cfs_rq *group_cfs_r */ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) { - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; + return cfs_rq->tg->cfs_rq[this_cpu]; } /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) -/* Do the two (enqueued) tasks belong to the same group ? */ -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { - if (curr->se.cfs_rq == p->se.cfs_rq) + if (se->cfs_rq == pse->cfs_rq) return 1; return 0; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -868,11 +695,17 @@ static inline struct cfs_rq *cpu_cfs_rq( #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int is_same_group(struct task_struct *curr, struct task_struct *p) +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) { return 1; } +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -890,6 +723,7 @@ static void enqueue_task_fair(struct rq break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); + wakeup = 1; } } @@ -909,6 +743,7 @@ static void dequeue_task_fair(struct rq /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; + sleep = 1; } } @@ -917,12 +752,10 @@ static void dequeue_task_fair(struct rq * * If compat_yield is turned on then we requeue to the end of the tree. */ -static void yield_task_fair(struct rq *rq, struct task_struct *p) +static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *rightmost, *se = &p->se; - struct rb_node *parent; + struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); + struct sched_entity *rightmost, *se = &rq->curr->se; /* * Are we the only task in the tree? @@ -936,49 +769,37 @@ static void yield_task_fair(struct rq *r * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + update_curr(cfs_rq); return; } /* * Find the rightmost entry in the rbtree: */ - do { - parent = *link; - link = &parent->rb_right; - } while (*link); - - rightmost = rb_entry(parent, struct sched_entity, run_node); + rightmost = __pick_last_entity(cfs_rq); /* * Already in the rightmost position? */ - if (unlikely(rightmost == se)) + if (unlikely(rightmost->vruntime < se->vruntime)) return; /* * Minimally necessary key value to be last in the tree: + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. 
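The is_same_group()/parent_entity() helpers above let the wakeup-preemption path (in the next hunk) climb both entity hierarchies until the two entities sit on the same cfs_rq, at which point their vruntimes are directly comparable. A minimal sketch of that walk over a toy entity type; the struct and field names are illustrative, and it assumes equal hierarchy depth, as the one-level group setup in this patch guarantees:

#include <stdio.h>
#include <stddef.h>

struct toy_entity {
	int cfs_rq_id;			/* stand-in for se->cfs_rq */
	struct toy_entity *parent;	/* stand-in for se->parent */
};

/* Walk both entities up until they are queued on the same cfs_rq. */
static void find_common_level(struct toy_entity **se, struct toy_entity **pse)
{
	while ((*se)->cfs_rq_id != (*pse)->cfs_rq_id) {
		*se = (*se)->parent;
		*pse = (*pse)->parent;
	}
}

int main(void)
{
	struct toy_entity root_a = { 0, NULL }, root_b = { 0, NULL };
	struct toy_entity leaf_a = { 1, &root_a }, leaf_b = { 2, &root_b };
	struct toy_entity *se = &leaf_a, *pse = &leaf_b;

	find_common_level(&se, &pse);
	printf("compare at cfs_rq %d\n", se->cfs_rq_id);	/* prints 0 */
	return 0;
}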
*/ - se->fair_key = rightmost->fair_key + 1; - - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); - /* - * Relink the task to the rightmost position: - */ - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + se->vruntime = rightmost->vruntime + 1; } /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - unsigned long gran; + struct sched_entity *se = &curr->se, *pse = &p->se; + s64 delta; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -987,15 +808,15 @@ static void check_preempt_curr_fair(stru return; } - gran = sysctl_sched_wakeup_granularity; - /* - * Batch tasks prefer throughput over latency: - */ - if (unlikely(p->policy == SCHED_BATCH)) - gran = sysctl_sched_batch_wakeup_granularity; + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); + delta = se->vruntime - pse->vruntime; + + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1076,7 +897,10 @@ static int cfs_rq_best_prio(struct cfs_r if (!cfs_rq->nr_running) return MAX_PRIO; - curr = __pick_next_entity(cfs_rq); + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + p = task_of(curr); return p->prio; @@ -1151,6 +975,8 @@ static void task_tick_fair(struct rq *rq } } +#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) + /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks @@ -1161,37 +987,30 @@ static void task_tick_fair(struct rq *rq static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; sched_info_queued(p); update_curr(cfs_rq); - update_stats_enqueue(cfs_rq, se); - /* - * Child runs first: we let it run before the parent - * until it reschedules once. We set up the key so that - * it will preempt the parent: - */ - se->fair_key = curr->fair_key - - niced_granularity(curr, sched_granularity(cfs_rq)) - 1; - /* - * The first wait is dominated by the child-runs-first logic, - * so do not credit it with that waiting time yet: - */ - if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - se->wait_start_fair = 0; + place_entity(cfs_rq, se, 1); - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - se->wait_runtime = -(sched_granularity(cfs_rq) / 2); + if (sysctl_sched_child_runs_first && + curr->vruntime < se->vruntime) { + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. 
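task_new_fair() in the next hunk implements sched_child_runs_first by swapping parent and child vruntime whenever the child would otherwise be placed behind the parent, so the child becomes leftmost and runs first. A compact sketch of just that decision, with illustrative names:

#include <stdio.h>

#define SWAP(a, b) \
	do { unsigned long long t = (a); (a) = (b); (b) = t; } while (0)

/*
 * If the child would land to the right of the parent, trade vruntimes so
 * the child runs first (mirrors the sched_child_runs_first handling).
 */
static void maybe_swap(unsigned long long *parent_vr,
		       unsigned long long *child_vr, int child_runs_first)
{
	if (child_runs_first && *parent_vr < *child_vr)
		SWAP(*parent_vr, *child_vr);
}

int main(void)
{
	unsigned long long parent = 1000, child = 1500;

	maybe_swap(&parent, &child, 1);
	printf("parent=%llu child=%llu\n", parent, child);	/* child now 1000 */
	return 0;
}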
+ */ + swap(curr->vruntime, se->vruntime); + } + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + check_spread(cfs_rq, curr); __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); + resched_task(rq->curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED /* Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task @@ -1204,11 +1023,6 @@ static void set_curr_task_fair(struct rq for_each_sched_entity(se) set_next_entity(cfs_rq_of(se), se); } -#else -static void set_curr_task_fair(struct rq *rq) -{ -} -#endif /* * All the scheduling class methods: @@ -1218,7 +1032,7 @@ struct sched_class fair_sched_class __re .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, - .check_preempt_curr = check_preempt_curr_fair, + .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, @@ -1235,6 +1049,9 @@ static void print_cfs_stats(struct seq_f { struct cfs_rq *cfs_rq; +#ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); +#endif for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); } diff -puN kernel/sched_idletask.c~git-sched kernel/sched_idletask.c --- a/kernel/sched_idletask.c~git-sched +++ a/kernel/sched_idletask.c @@ -50,6 +50,10 @@ static void task_tick_idle(struct rq *rq { } +static void set_curr_task_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -66,6 +70,7 @@ static struct sched_class idle_sched_cla .load_balance = load_balance_idle, + .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, /* no .task_new for idle tasks */ }; diff -puN kernel/sched_rt.c~git-sched kernel/sched_rt.c --- a/kernel/sched_rt.c~git-sched +++ a/kernel/sched_rt.c @@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *r } static void -yield_task_rt(struct rq *rq, struct task_struct *p) +yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, p); + requeue_task_rt(rq, rq->curr); } /* @@ -218,6 +218,13 @@ static void task_tick_rt(struct rq *rq, } } +static void set_curr_task_rt(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock; +} + static struct sched_class rt_sched_class __read_mostly = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -230,5 +237,6 @@ static struct sched_class rt_sched_class .load_balance = load_balance_rt, + .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, }; diff -puN kernel/sched_stats.h~git-sched kernel/sched_stats.h --- a/kernel/sched_stats.h~git-sched +++ a/kernel/sched_stats.h @@ -16,18 +16,18 @@ static int show_schedstat(struct seq_fil struct rq *rq = cpu_rq(cpu); #ifdef CONFIG_SMP struct sched_domain *sd; - int dcnt = 0; + int dcount = 0; #endif /* runqueue-specific stats */ seq_printf(seq, "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu %llu %lu", cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); seq_printf(seq, "\n"); @@ -39,12 +39,12 @@ static int show_schedstat(struct seq_fil char mask_str[NR_CPUS]; cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", 
dcnt++, mask_str); + seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " "%lu", - sd->lb_cnt[itype], + sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], sd->lb_imbalance[itype], @@ -55,9 +55,9 @@ static int show_schedstat(struct seq_fil } seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } @@ -101,7 +101,7 @@ rq_sched_info_arrive(struct rq *rq, unsi { if (rq) { rq->rq_sched_info.run_delay += delta; - rq->rq_sched_info.pcnt++; + rq->rq_sched_info.pcount++; } } @@ -129,7 +129,7 @@ rq_sched_info_depart(struct rq *rq, unsi # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -164,7 +164,7 @@ static void sched_info_arrive(struct tas sched_info_dequeued(t); t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; - t->sched_info.pcnt++; + t->sched_info.pcount++; rq_sched_info_arrive(task_rq(t), delta); } @@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *pr #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHEDSTATS */ diff -puN kernel/sysctl.c~git-sched kernel/sysctl.c --- a/kernel/sysctl.c~git-sched +++ a/kernel/sysctl.c @@ -222,14 +222,11 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_granularity_ns", - .data = &sysctl_sched_min_granularity, + .procname = "sched_nr_latency", + .data = &sysctl_sched_nr_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, + .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, @@ -266,28 +263,6 @@ static ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_stat_granularity_ns", - .data = &sysctl_sched_stat_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_runtime_limit_ns", - .data = &sysctl_sched_runtime_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), diff -puN kernel/user.c~git-sched kernel/user.c --- a/kernel/user.c~git-sched +++ a/kernel/user.c @@ -50,8 +50,41 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif +#ifdef 
CONFIG_FAIR_USER_SCHED + .tg = &init_task_grp, +#endif }; +#ifdef CONFIG_FAIR_USER_SCHED +static void sched_destroy_user(struct user_struct *up) +{ + sched_destroy_group(up->tg); +} + +static int sched_create_user(struct user_struct *up) +{ + int rc = 0; + + up->tg = sched_create_group(); + if (IS_ERR(up->tg)) + rc = -ENOMEM; + + return rc; +} + +static void sched_switch_user(struct task_struct *p) +{ + sched_move_task(p); +} + +#else /* CONFIG_FAIR_USER_SCHED */ + +static void sched_destroy_user(struct user_struct *up) { } +static int sched_create_user(struct user_struct *up) { return 0; } +static void sched_switch_user(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_USER_SCHED */ + /* * These routines must be called with the uidhash spinlock held! */ @@ -109,6 +142,7 @@ void free_uid(struct user_struct *up) if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); @@ -150,6 +184,13 @@ struct user_struct * alloc_uid(struct us return NULL; } + if (sched_create_user(new) < 0) { + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + return NULL; + } + /* * Before adding this, check whether we raced * on adding the same user already.. @@ -157,6 +198,7 @@ struct user_struct * alloc_uid(struct us spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + sched_destroy_user(new); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -184,6 +226,7 @@ void switch_uid(struct user_struct *new_ atomic_dec(&old_user->processes); switch_uid_keyring(new_user); current->user = new_user; + sched_switch_user(current); /* * We need to synchronize with __sigqueue_alloc() _