GIT 50619554d80485e0ab6d99de0e11cb9885224efc git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched.git

commit 50619554d80485e0ab6d99de0e11cb9885224efc
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:34 2007 +0200

    sched: kernel/sched_fair.c whitespace cleanups

    some trivial whitespace cleanups.

    Signed-off-by: Ingo Molnar

commit e371e5374a8adc294631c1aaa9bfcdff00246bc4
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:34 2007 +0200

    sched: enhance debug output

    enhance debug output by changing 12345678 nsecs to 12.345678 output;
    this is more human-readable.

    Signed-off-by: Ingo Molnar

commit a5b01a0c82a1bb54eac40e3713d1d6224f0e4e31
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:34 2007 +0200

    sched: prettify /proc/sched_debug output

    print the correct number of dashes in /proc/sched_debug.

    Signed-off-by: Ingo Molnar

commit 8aa819407e23f579e24ce8f8d3e20649e890de88
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:34 2007 +0200

    sched: rework enqueue/dequeue_entity() to get rid of set_curr_task()

    rework enqueue/dequeue_entity() to get rid of
    sched_class::set_curr_task(). This simplifies sched_setscheduler(),
    rt_mutex_setprio() and sched_move_task().

       text    data     bss     dec     hex filename
      24330    2734      20   27084    69cc sched.o.before
      24233    2730      20   26983    6967 sched.o.after

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Srivatsa Vaddagiri
    Signed-off-by: Ingo Molnar

commit b6dc321247d78cb97082676e174a28dd61a66755
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:34 2007 +0200

    sched: simplify sched_class::yield_task()

    the 'p' (task_struct) parameter in sched_class::yield_task() is
    redundant as the caller is always 'current'. Get rid of it.

       text    data     bss     dec     hex filename
      24341    2734      20   27095    69d7 sched.o.before
      24330    2734      20   27084    69cc sched.o.after

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar

commit ab630c090b49b75668fe6fc2a15bed43d8248669
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: optimize task_new_fair()

    due to the fact that we no longer keep the 'current' within the tree,
    dequeue/enqueue_entity() is useless for the 'current' in
    task_new_fair(). We are about to reschedule and
    sched_class->put_prev_task() will put the 'current' back into the
    tree, based on its new key.

       text    data     bss     dec     hex filename
      24388    2734      20   27142    6a06 sched.o.before
      24341    2734      20   27095    69d7 sched.o.after

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar

commit ae71287b651eeee369e726033d2edb5f46a220c9
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: disable START_DEBIT

    disable START_DEBIT - let's see how well we go with higher
    parallelism.

    Signed-off-by: Ingo Molnar

commit cae3997cc3f86bae0253ce0b4d98176f7064726c
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: fix delay accounting performance regression

    fix delay accounting performance regression - those sched_clock()
    calls are not needed.

    Signed-off-by: Ingo Molnar

commit f5977d46a967a9e00c03708a1a60c3aef7bc0d15
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: do not keep current in the tree and get rid of sched_entity::fair_key

    Get rid of 'sched_entity::fair_key'. As a side effect, 'current' is
    not kept within the tree for SCHED_NORMAL/BATCH tasks anymore. This
    simplifies some parts of code (e.g. entity_tick() and
    yield_task_fair()) and also somewhat optimizes them (e.g. a single
    update_curr() now vs. dequeue/enqueue() before in entity_tick()).

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar
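[ Aside: the debug-output change above prints a nanosecond count as
  "12.345678" by splitting it into whole milliseconds and the
  sub-millisecond remainder. A minimal userspace sketch of that rule
  (plain C division; the helper names mirror the nsec_high()/nsec_low()
  helpers added to kernel/sched_debug.c later in this diff, but this is
  not the kernel code itself): ]

#include <stdio.h>

/* Illustrative userspace sketch: split a nanosecond count into the
 * "msec.remainder" form used by the new debug output, e.g.
 * 12345678 nsecs -> "12.345678". Negative values are not handled. */
static long long nsec_high(long long nsec)
{
	return nsec / 1000000;		/* whole milliseconds */
}

static long nsec_low(long long nsec)
{
	return (long)(nsec % 1000000);	/* sub-millisecond remainder */
}

int main(void)
{
	long long t = 12345678LL;

	printf("%lld.%06ld\n", nsec_high(t), nsec_low(t));	/* prints 12.345678 */
	return 0;
}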
commit 11d3d708d18f91b55f24beabba8d0c7967bbd08f
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: add set_curr_task() calls

    p->sched_class->set_curr_task() has to be called before
    activate_task()/enqueue_task() in rt_mutex_setprio(),
    sched_setscheduler() and sched_move_task() in order to set up
    'cfs_rq->curr'. The logic of enqueueing depends on whether a task
    to be inserted is 'current' or not.

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar

commit 109cbc28e636806dd65aeb5107c7381902451d02
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: sched_setscheduler() fix

    Fix a problem in the 'sched-group' patch for !CONFIG_FAIR_GROUP_SCHED.

    description:

    sched_setscheduler()
    {
    ...
        if (task_running())
            p->sched_class->put_prev_entity();
            [ this one sets up cfs_rq->curr to NULL ]
    ...
        if (task_running())
            p->sched_class->set_curr_task();
            [ and this one is a _NOP_ (empty) for !CONFIG_FAIR_GROUP_SCHED ]

    As a result, the task continues to run with cfs_rq->curr == NULL...
    no crashes (due to checks for !NULL in place) but e.g. update_curr()
    effectively becomes a NOP... i.e. runtime statistics for this task
    are not accounted until it's rescheduled anew.

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar

commit 1a877dc0ec5b9c84b5d9d956c64ebfe5eb9dea7d
Author: Srivatsa Vaddagiri
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: group-scheduler core

    Add interface to control cpu bandwidth allocation to task-groups.
    (not yet configurable, due to missing CONFIG_CONTAINERS)

    Signed-off-by: Srivatsa Vaddagiri
    Signed-off-by: Dhaval Giani
    Signed-off-by: Ingo Molnar

commit ba92ee1d027a597d6bd0d4c99a0d54557b414eb0
Author: Mike Galbraith
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: fix SMP migration latencies

    fix SMP migration latencies.

    Signed-off-by: Mike Galbraith
    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra

commit 09fd52817bec0464283ec29ec731e64212577975
Author: Peter Zijlstra
Date:   Thu Sep 20 09:49:33 2007 +0200

    sched: better min_vruntime tracking

    Better min_vruntime tracking.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit 42fbed8117c0f762d4736b16426df5cbb44ec5a2
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: x86: allow single-depth wchan output

    sched.o gets smaller and faster if we compile it with
    -fomit-frame-pointer, so make this a config option. The cost is the
    loss of multi-depth wchan lookups - but SysRq-T is a sufficient
    replacement for them anyway, so their utility is much lower these
    days.

    The size difference is significant:

       text    data     bss     dec     hex filename
      34005    3462      24   37491    9273 sched.o.before
      33470    3462      24   36956    905c sched.o.after

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit aa4a575ded959d2bdeed58a0344eaeaa49e0914c
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: clean up schedstat block in dequeue_entity()

    Better placement of the #ifdef CONFIG_SCHEDSTATS block in
    dequeue_entity().

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit d487bab0d09da1c555b3167dedc943f4d5ad8834
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: remove wait_runtime fields and features

    remove wait_runtime based fields and features.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith
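[ Aside: for the group-scheduler core above, each task group gets a
  'shares' weight, and runnable groups on a CPU are expected to receive
  CPU time roughly in proportion to those weights. A small, hypothetical
  back-of-the-envelope calculation (values are illustrative, not taken
  from the patch): ]

#include <stdio.h>

/* Hypothetical illustration of weight-proportional bandwidth: with two
 * runnable groups on one CPU, each group's expected share of CPU time
 * is shares_i / (sum of all runnable groups' shares). */
int main(void)
{
	unsigned long shares[] = { 1024, 2048 };	/* e.g. two task groups */
	unsigned long total = 0;
	unsigned int i;

	for (i = 0; i < 2; i++)
		total += shares[i];

	for (i = 0; i < 2; i++)
		printf("group %u: %.1f%% of CPU\n",
		       i, 100.0 * shares[i] / total);	/* 33.3% and 66.7% */
	return 0;
}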
commit 49b72e4a0942736a989b366a198750fadf18a699
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: remove wait_runtime limit

    remove the wait_runtime-limit fields.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 60092629712ea7a97d7fa116b03bc02469a7cc66
Author: Dmitry Adamushko
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: clean up struct load_stat

    'struct load_stat' is redundant now so let's get rid of it.

    Signed-off-by: Dmitry Adamushko
    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 87f94149ce4dd8cbbe6b5df8fc60ffad6bbb1a0b
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: debug: update exec_clock only when SCHED_DEBUG

    micro-optimization: update cfs_rq->exec_clock only if
    CONFIG_SCHED_DEBUG=y.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit fb48b10da06041d925d2ed00d30ae39faabe65fa
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: add more vruntime statistics

    add more vruntime statistics.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit a14488f7405e9a8a1fed0e8bcd7009aa6b60bc49
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: sync up ->min_vruntime when going idle

    sync up ->min_vruntime when going idle.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit bbb2e2c189227f17d9a1d05b06097be0ae2eb7e0
Author: Peter Zijlstra
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: handle vruntime overflow

    Handle vruntime overflow by centering the key space around
    min_vruntime.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit a210f82439d2452668a8315979ffd97e7e5b6356
Author: Peter Zijlstra
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: add tree based averages

    add support for tree based vruntime averages.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit 9a392cb97971fd631c8ce10368d0f8606a163a6c
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: add se->vruntime debugging

    debug se->vruntime fields.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 12048e325da722a768b2508d59033a77e4c0862f
Author: Peter Zijlstra
Date:   Thu Sep 20 09:49:32 2007 +0200

    sched: clean up new task placement

    clean up new task placement.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit 52809d1c1a9a43667195132a4d25dcf7d3119afc
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: wakeup granularity fix

    fix wakeup granularity. (and increase the default)

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit a23004c9f54a8c2257a272fd1a31705f3350b807
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: simplify check_preempt() methods

    simplify the check_preempt() methods.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 6e8ebdd6a77c3f01359e73c15052896351b80475
Author: Peter Zijlstra
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: simplify adaptive latency

    simplify adaptive latency.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith
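[ Aside: the vruntime-overflow commit above centers the rbtree key space
  around cfs_rq->min_vruntime, i.e. entities are ordered by the signed
  difference (vruntime - min_vruntime) rather than by the raw u64 value,
  so the ordering survives wraparound. A self-contained sketch of that
  comparison (illustrative values; not the kernel implementation): ]

#include <stdio.h>
#include <stdint.h>

/* Entity keys are signed offsets from min_vruntime; comparing the
 * offsets stays correct even when the raw u64 vruntime wraps around. */
static int64_t entity_key(uint64_t min_vruntime, uint64_t vruntime)
{
	return (int64_t)(vruntime - min_vruntime);
}

int main(void)
{
	uint64_t min_vruntime = UINT64_MAX - 100;	/* leftmost key, just before wrap */
	uint64_t a = UINT64_MAX - 50;			/* ran less: logically earlier */
	uint64_t b = 25;				/* wrapped past zero: logically later */

	/* Raw comparison gets the order wrong: b < a as plain u64. */
	printf("raw:      a %s b\n", a < b ? "<" : ">=");

	/* Centered keys order them correctly: key(a)=50 < key(b)=126. */
	printf("centered: %lld < %lld\n",
	       (long long)entity_key(min_vruntime, a),
	       (long long)entity_key(min_vruntime, b));
	return 0;
}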
commit 3e73017a14196fa73817597143cfcf1fd3020f64
Author: Peter Zijlstra
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: new task placement for vruntime

    add proper new task placement for the vruntime based math too.

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit bc8470959656430b86986dc053e2a74cea928fb3
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: optimize vruntime based scheduling

    optimize vruntime based scheduling.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit e09f1395e264931b387b9ffeacfdcccde336fa25
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: move sched_feat() definitions

    move the sched_feat() definitions so that they can be used sooner by
    generic code too.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit ea2e10f362e546545b9d10f27b470ecdf5075c8e
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: introduce se->vruntime

    introduce se->vruntime as a sum of weighted delta-exec's, and use
    that as the key into the tree.

    The idea to use absolute virtual time as the basic metric of
    scheduling was first raised by William Lee Irwin, advanced by Tong Li
    and first prototyped by Roman Zippel in the "Really Fair Scheduler"
    (RFS) patchset.

    Also see: http://lkml.org/lkml/2007/9/2/76 for a simpler variant of
    this patch.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 86ba767535c182cacc5747be920de86cf20d7341
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: clean up calc_weighted()

    clean up calc_weighted() - we always use the normalized shift, so
    there is no need to pass it in. Also, push the non-nice0 branch into
    the function.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 33d2e73cc4ec7a8459f800a470aab394245d3169
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: speed up update_load_add/_sub()

    speed up update_load_add/_sub() by not delaying the division - this
    reduces CPU pipeline dependencies.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 4d82618363e5c7cc5c8a38d37de54ca2779b2a2d
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: uninline __enqueue_entity()/__dequeue_entity()

    suggested by Roman Zippel: uninline __enqueue_entity() and
    __dequeue_entity().

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 429924d5f0dabdf1fcb33a78e6a182f69367fedd
Author: Peter Zijlstra
Date:   Thu Sep 20 09:49:31 2007 +0200

    sched: simplify SCHED_FEAT_* code

    Peter Zijlstra suggested simplifying the SCHED_FEAT_* checks via the
    sched_feat(x) macro.

    No code changed:

       text    data     bss     dec     hex filename
      38895    3550      24   42469    a5e5 sched.o.before
      38895    3550      24   42469    a5e5 sched.o.after

    Signed-off-by: Peter Zijlstra
    Signed-off-by: Ingo Molnar
    Signed-off-by: Mike Galbraith

commit 6d7faa79a1e5cc8b1a678dd0dad9cc1ca67faa96
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: cleanup: simplify cfs_rq_curr() methods

    cleanup: simplify the cfs_rq_curr() methods - now that the
    cfs_rq->curr pointer is unconditionally present, remove the wrappers.

    kernel/sched.o:

       text    data     bss     dec     hex filename
      11784     224    2012   14020    36c4 sched.o.before
      11784     224    2012   14020    36c4 sched.o.after

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith
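[ Aside: with se->vruntime introduced above, virtual runtime advances by
  roughly delta_exec * NICE_0_LOAD / se->load.weight, so a heavier
  (lower-nice) task accumulates key value more slowly and stays further
  left in the tree. A minimal sketch of that scaling, assuming
  NICE_0_LOAD == 1024; the kernel's calc_delta_fair()/calc_delta_mine()
  use fixed-point inverse weights instead of this plain division: ]

#include <stdio.h>

#define NICE_0_LOAD 1024UL	/* weight of a nice-0 task (assumption for this sketch) */

/* Illustrative version of the weighting idea behind se->vruntime:
 * virtual runtime advances as delta_exec * NICE_0_LOAD / weight, so a
 * heavier task's key grows more slowly and it gets picked again sooner. */
static unsigned long long weighted_delta(unsigned long long delta_exec,
					 unsigned long weight)
{
	return delta_exec * NICE_0_LOAD / weight;
}

int main(void)
{
	unsigned long long slice = 10000000ULL;	/* both ran 10ms of real time */

	printf("nice-0  task: vruntime += %llu ns\n",
	       weighted_delta(slice, 1024));	/* 10000000 */
	printf("heavier task: vruntime += %llu ns\n",
	       weighted_delta(slice, 2048));	/* 5000000 */
	return 0;
}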
commit a3ad365ce063087c598734acffad07edd1844b40
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: track cfs_rq->curr on !group-scheduling too

    Noticed by Roman Zippel: use cfs_rq->curr in the !group-scheduling
    case too. Small micro-optimization and cleanup effect:

       text    data     bss     dec     hex filename
      36269    3482      24   39775    9b5f sched.o.before
      36177    3486      24   39687    9b07 sched.o.after

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit f915733a880cafff73cdd0d910f1628a1c17422f
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: remove precise CPU load calculations #2

    continued removal of precise CPU load calculations.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 8a551e56405b3e23991e8f77934166d6243c3ec7
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: remove precise CPU load

    CPU load calculations are statistical anyway, and there's little
    benefit from having them calculated on every scheduling event. So
    remove this code; it gets rid of a divide from the scheduler wakeup
    and context-switch fastpath.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 5e100b83e3ffce9b0d800c9a1eedbc31aea26af7
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: remove stat_gran

    remove the stat_gran code - it was disabled by default and it caused
    unnecessary overhead.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit b933dafc464ecc9bd46511b02f2fcce6ede58c7d
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: use constants if !CONFIG_SCHED_DEBUG

    use constants if !CONFIG_SCHED_DEBUG. This speeds up the code and
    reduces code-size:

       text    data     bss     dec     hex filename
      27464    3014      16   30494    771e sched.o.before
      26929    3010      20   29959    7507 sched.o.after

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 5a1d55cf681f5fd6cb1cf7243c67f983c56ae50c
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: uniform tunings

    use uniform tunings on UP and SMP alike.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 5e5d438ac07f8acc5cc33d69b81e56d949b3bc04
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: debug: track maximum 'slice'

    track the maximum amount of time a task has executed while the CPU
    load was at least 2x. (i.e. at least two nice-0 tasks were runnable)

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 89a049dfb7db127b6aed5d66059c47510df6499a
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: small sched_debug cleanup

    small kernel/sched_debug.c cleanup - break up multi-variable
    assignment.

    no code changed:

       text    data     bss     dec     hex filename
      38869    3550      24   42443    a5cb sched.o.before
      38869    3550      24   42443    a5cb sched.o.after

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith
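[ Aside: the "use constants if !CONFIG_SCHED_DEBUG" change works by
  having one macro decide whether a tunable is a writable variable
  (__read_mostly in the patch) or a 'static const' the compiler can fold
  at each use site. A standalone sketch of the pattern, with a made-up
  DEBUG switch standing in for CONFIG_SCHED_DEBUG: ]

#include <stdio.h>

/* Sketch of the const_debug pattern: with DEBUG the tunable is a real
 * (runtime-writable) variable; without it the tunable becomes a static
 * const that the optimizer can constant-fold wherever it is used. */
#ifdef DEBUG
# define const_debug			/* plain variable, runtime-tunable */
#else
# define const_debug static const	/* compile-time constant */
#endif

const_debug unsigned int sysctl_sched_latency = 20000000U;	/* 20 ms */

int main(void)
{
	printf("latency: %u ns\n", sysctl_sched_latency);
	return 0;
}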
commit 76e1996995dd6ebcc6a5b6ebd92a0bb40d77c809
Author: Matthias Kaehlcke
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: use list_for_each_entry_safe() in __wake_up_common()

    Use list_for_each_entry_safe() instead of list_for_each_safe() in
    __wake_up_common().

    Signed-off-by: Matthias Kaehlcke
    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit cb74a6ca78be6c8ce037da5a8d654da36d2dca7c
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:30 2007 +0200

    sched: resched task in task_new_fair()

    to get full child-runs-first semantics, make sure the parent is
    rescheduled.

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

commit 34060d7f2724ae7c206e6475be8da29656aab021
Author: Ingo Molnar
Date:   Thu Sep 20 09:49:29 2007 +0200

    sched: fix new-task method

    always call into ->task_new().

    Signed-off-by: Ingo Molnar
    Signed-off-by: Peter Zijlstra
    Signed-off-by: Mike Galbraith

 arch/i386/Kconfig     |   11 +
 include/linux/sched.h |   24 --
 init/Kconfig          |    9 +
 kernel/sched.c        |  544 ++++++++++++++++++++++++++------------
 kernel/sched_debug.c  |  177 +++++-----
 kernel/sched_fair.c   |  702 ++++++++++++++++++-------------------------------
 kernel/sched_rt.c     |    4
 kernel/sched_stats.h  |    4
 kernel/sysctl.c       |   22 --
 9 files changed, 749 insertions(+), 748 deletions(-)

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 97b64d7..c9fc6a8 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -214,6 +214,17 @@ config X86_ES7000
 endchoice
 
+config SCHED_NO_NO_OMIT_FRAME_POINTER
+	bool "Single-depth WCHAN output"
+	default y
+	help
+	  Calculate simpler /proc//wchan values. If this option
+	  is disabled then wchan values will recurse back to the
+	  caller function. This provides more accurate wchan values,
+	  at the expense of slightly more scheduling overhead.
+
+	  If in doubt, say "Y".
+ config PARAVIRT bool "Paravirtualization support (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/include/linux/sched.h b/include/linux/sched.h index a01ac6d..78f215a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -857,7 +857,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); - void (*yield_task) (struct rq *rq, struct task_struct *p); + void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); @@ -870,7 +870,6 @@ struct sched_class { struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio); - void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p); void (*task_new) (struct rq *rq, struct task_struct *p); }; @@ -887,31 +886,21 @@ struct load_weight { * 4 se->block_start * 4 se->run_node * 4 se->sleep_start - * 4 se->sleep_start_fair * 6 se->load.weight - * 7 se->delta_fair - * 15 se->wait_runtime */ struct sched_entity { - long wait_runtime; - unsigned long delta_fair_run; - unsigned long delta_fair_sleep; - unsigned long delta_exec; - s64 fair_key; struct load_weight load; /* for load-balancing */ struct rb_node run_node; unsigned int on_rq; u64 exec_start; u64 sum_exec_runtime; + u64 vruntime; u64 prev_sum_exec_runtime; - u64 wait_start_fair; - u64 sleep_start_fair; #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; - s64 sum_wait_runtime; u64 sleep_start; u64 sleep_max; @@ -920,9 +909,7 @@ #ifdef CONFIG_SCHEDSTATS u64 block_start; u64 block_max; u64 exec_max; - - unsigned long wait_runtime_overruns; - unsigned long wait_runtime_underruns; + u64 slice_max; #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1400,15 +1387,18 @@ #endif extern void sched_idle_next(void); +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; extern unsigned int sysctl_sched_runtime_limit; -extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; +#endif + +extern unsigned int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); diff --git a/init/Kconfig b/init/Kconfig index d54d0ca..11c6762 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -281,6 +281,15 @@ config CPUSETS Say N if unsure. +config FAIR_GROUP_SCHED + bool "Fair group scheduler" + depends on EXPERIMENTAL && CONTAINERS + help + This option enables you to group tasks and control CPU resource + allocation to such groups. + + Say N if unsure. 
+ config SYSFS_DEPRECATED bool "Create deprecated sysfs files" default y diff --git a/kernel/sched.c b/kernel/sched.c index 6107a0c..c93ae30 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -170,31 +170,75 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -struct load_stat { - struct load_weight load; - u64 load_update_start, load_update_last; - unsigned long delta_fair, delta_exec, delta_stat; +#ifdef CONFIG_FAIR_GROUP_SCHED + +#include + +struct cfs_rq; + +/* task group related information */ +struct task_grp { + struct container_subsys_state css; + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; }; +/* Default task group's sched entity on each cpu */ +static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); +/* Default task group's cfs_rq on each cpu */ +static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; + +static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS]; +static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS]; + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +static struct task_grp init_task_grp = { + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, + }; + +/* return group to which a task belongs */ +static inline struct task_grp *task_grp(struct task_struct *p) +{ + return container_of(task_subsys_state(p, cpu_subsys_id), + struct task_grp, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_cfs_rq(struct task_struct *p) +{ + p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)]; + p->se.parent = task_grp(p)->se[task_cpu(p)]; +} + +#else + +static inline void set_task_cfs_rq(struct task_struct *p) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; unsigned long nr_running; - s64 fair_clock; u64 exec_clock; - s64 wait_runtime; - u64 sleeper_bonus; - unsigned long wait_runtime_overruns, wait_runtime_underruns; + u64 min_vruntime; + unsigned long nr_sync_min_vruntime; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; struct rb_node *rb_load_balance_curr; -#ifdef CONFIG_FAIR_GROUP_SCHED /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; +#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in @@ -205,6 +249,7 @@ #ifdef CONFIG_FAIR_GROUP_SCHED * list is used during load balance. */ struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? 
*/ + struct task_grp *tg; /* group that "owns" this runqueue */ #endif }; @@ -236,7 +281,7 @@ struct rq { #ifdef CONFIG_NO_HZ unsigned char in_nohz_recently; #endif - struct load_stat ls; /* capture load from *all* tasks on this cpu */ + struct load_weight load; /* capture load from *all* tasks on this cpu */ unsigned long nr_load_updates; u64 nr_switches; @@ -382,6 +427,33 @@ #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# define const_debug __read_mostly +#else +# define const_debug static const +#endif + +/* + * Debugging: various feature bits + */ +enum { + SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, + SCHED_FEAT_START_DEBIT = 2, + SCHED_FEAT_USE_TREE_AVG = 4, + SCHED_FEAT_APPROX_AVG = 8, +}; + +const_debug unsigned int sysctl_sched_features = + SCHED_FEAT_NEW_FAIR_SLEEPERS *1 | + SCHED_FEAT_START_DEBIT *0 | + SCHED_FEAT_USE_TREE_AVG *0 | + SCHED_FEAT_APPROX_AVG *0; + +#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + +/* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): */ @@ -400,18 +472,6 @@ unsigned long long cpu_clock(int cpu) return now; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Change a task's ->cfs_rq if it moves across CPUs */ -static inline void set_task_cfs_rq(struct task_struct *p) -{ - p->se.cfs_rq = &task_rq(p)->cfs; -} -#else -static inline void set_task_cfs_rq(struct task_struct *p) -{ -} -#endif - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -644,19 +704,6 @@ static inline void resched_task(struct t } #endif -static u64 div64_likely32(u64 divident, unsigned long divisor) -{ -#if BITS_PER_LONG == 32 - if (likely(divident <= 0xffffffffULL)) - return (u32)divident / divisor; - do_div(divident, divisor); - - return divident; -#else - return divident / divisor; -#endif -} - #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) #else @@ -698,16 +745,14 @@ calc_delta_fair(unsigned long delta_exec return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); } -static void update_load_add(struct load_weight *lw, unsigned long inc) +static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; - lw->inv_weight = 0; } -static void update_load_sub(struct load_weight *lw, unsigned long dec) +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; - lw->inv_weight = 0; } /* @@ -792,20 +837,11 @@ #endif #define sched_class_highest (&rt_sched_class) -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - /* * Update delta_exec, delta_fair fields for rq. * * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while + * total load (rq->load.weight) on the runqueue, while * delta_exec advances at the same rate as wall-clock (provided * cpu is not idle). * @@ -813,35 +849,17 @@ static void __update_curr_load(struct rq * runqueue over any given interval. This (smoothened) load is used * during load balance. * - * This function is called /before/ updating rq->ls.load + * This function is called /before/ updating rq->load * and when switching tasks. 
*/ -static void update_curr_load(struct rq *rq) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = rq->clock; - ls->delta_stat += rq->clock - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. - */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); -} - static inline void inc_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); - update_load_add(&rq->ls.load, p->se.load.weight); + update_load_add(&rq->load, p->se.load.weight); } static inline void dec_load(struct rq *rq, const struct task_struct *p) { - update_curr_load(rq); - update_load_sub(&rq->ls.load, p->se.load.weight); + update_load_sub(&rq->load, p->se.load.weight); } static void inc_nr_running(struct task_struct *p, struct rq *rq) @@ -858,8 +876,6 @@ static void dec_nr_running(struct task_s static void set_load_weight(struct task_struct *p) { - p->se.wait_runtime = 0; - if (task_has_rt_policy(p)) { p->se.load.weight = prio_to_weight[0] * 2; p->se.load.inv_weight = prio_to_wmult[0] >> 1; @@ -988,7 +1004,7 @@ inline int task_curr(const struct task_s /* Used instead of source_load when we know the type == 0 */ unsigned long weighted_cpuload(const int cpu) { - return cpu_rq(cpu)->ls.load.weight; + return cpu_rq(cpu)->load.weight; } static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) @@ -1005,15 +1021,9 @@ void set_task_cpu(struct task_struct *p, { int old_cpu = task_cpu(p); struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - u64 clock_offset, fair_clock_offset; + u64 clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; - - if (p->se.wait_start_fair) - p->se.wait_start_fair -= fair_clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) @@ -1023,6 +1033,9 @@ #ifdef CONFIG_SCHEDSTATS if (p->se.block_start) p->se.block_start -= clock_offset; #endif + if (likely(new_rq->cfs.min_vruntime)) + p->se.vruntime -= old_rq->cfs.min_vruntime - + new_rq->cfs.min_vruntime; __set_task_cpu(p, new_cpu); } @@ -1583,28 +1596,20 @@ int fastcall wake_up_state(struct task_s */ static void __sched_fork(struct task_struct *p) { - p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; - p->se.delta_exec = 0; - p->se.delta_fair_run = 0; - p->se.delta_fair_sleep = 0; - p->se.wait_runtime = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; p->se.block_start = 0; p->se.sleep_max = 0; p->se.block_max = 0; p->se.exec_max = 0; + p->se.slice_max = 0; p->se.wait_max = 0; - p->se.wait_runtime_overruns = 0; - p->se.wait_runtime_underruns = 0; #endif INIT_LIST_HEAD(&p->run_list); @@ -1657,12 +1662,6 @@ #endif } /* - * After fork, child runs first. (default) If set to 0 then - * parent will (try to) run first. - */ -unsigned int __read_mostly sysctl_sched_child_runs_first = 1; - -/* * wake_up_new_task - wake up a newly created task for the first time. 
* * This function will do some initial scheduler statistics housekeeping @@ -1687,10 +1686,8 @@ void fastcall wake_up_new_task(struct ta else p->sched_class = &fair_sched_class; - if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || - (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || - !current->se.on_rq) { - + if (task_cpu(p) != this_cpu || !p->sched_class->task_new || + !current->se.on_rq) { activate_task(rq, p, 0); } else { /* @@ -1981,42 +1978,10 @@ unsigned long nr_active(void) */ static void update_cpu_load(struct rq *this_rq) { - u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; - unsigned long total_load = this_rq->ls.load.weight; - unsigned long this_load = total_load; - struct load_stat *ls = &this_rq->ls; + unsigned long this_load = this_rq->load.weight; int i, scale; this_rq->nr_load_updates++; - if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) - goto do_avg; - - /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq); - - fair_delta64 = ls->delta_fair + 1; - ls->delta_fair = 0; - - exec_delta64 = ls->delta_exec + 1; - ls->delta_exec = 0; - - sample_interval64 = this_rq->clock - ls->load_update_last; - ls->load_update_last = this_rq->clock; - - if ((s64)sample_interval64 < (s64)TICK_NSEC) - sample_interval64 = TICK_NSEC; - - if (exec_delta64 > sample_interval64) - exec_delta64 = sample_interval64; - - idle_delta64 = sample_interval64 - exec_delta64; - - tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); - tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); - - this_load = (unsigned long)tmp64; - -do_avg: /* Update our load: */ for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2026,7 +1991,13 @@ do_avg: old_load = this_rq->cpu_load[i]; new_load = this_load; - + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } } @@ -3635,10 +3606,9 @@ EXPORT_SYMBOL(default_wake_function); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { - struct list_head *tmp, *next; + wait_queue_t *curr, *next; - list_for_each_safe(tmp, next, &q->task_list) { - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, sync, key) && @@ -4263,8 +4233,10 @@ recheck: on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq, p, 0); + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (on_rq) { activate_task(rq, p, 0); /* @@ -4555,7 +4527,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq); /* * Since we are going to call schedule() anyway, there's @@ -4899,32 +4871,6 @@ #endif */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. 
- * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static inline void sched_init_granularity(void) -{ - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 100000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; - - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_runtime_limit = sysctl_sched_latency; - sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; -} - #ifdef CONFIG_SMP /* * This is how migration works: @@ -6492,12 +6438,10 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); - sched_init_granularity(); } #else void __init sched_init_smp(void) { - sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -6514,7 +6458,6 @@ int in_sched_functions(unsigned long add static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; - cfs_rq->fair_clock = 1; #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif @@ -6522,7 +6465,6 @@ #endif void __init sched_init(void) { - u64 now = sched_clock(); int highest_cpu = 0; int i, j; @@ -6545,10 +6487,26 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + { + struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); + struct sched_entity *se = + &per_cpu(init_sched_entity, i); + + init_cfs_rq_p[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = &init_task_grp; + list_add(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + + init_sched_entity_p[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + init_task_grp.shares = NICE_0_LOAD; #endif - rq->ls.load_update_last = now; - rq->ls.load_update_start = now; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -6642,17 +6600,12 @@ void normalize_rt_tasks(void) read_lock_irq(&tasklist_lock); do_each_thread(g, p) { - p->se.fair_key = 0; - p->se.wait_runtime = 0; p->se.exec_start = 0; - p->se.wait_start_fair = 0; - p->se.sleep_start_fair = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; if (!rt_task(p)) { @@ -6739,3 +6692,244 @@ void set_curr_task(int cpu, struct task_ } #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* return corresponding task_grp object of a container */ +static inline struct task_grp *container_tg(struct container *cont) +{ + return container_of(container_subsys_state(cont, cpu_subsys_id), + struct task_grp, css); +} + +/* allocate runqueue etc for a new task group */ +static struct container_subsys_state * +sched_create_group(struct container_subsys *ss, struct container *cont) +{ + struct task_grp *tg; + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + if (!cont->parent) { + /* This is early initialization for the top container */ + init_task_grp.css.container = cont; + return &init_task_grp.css; + } + + /* we support only 1-level deep hierarchical scheduler atm */ + if (cont->parent->parent) + return ERR_PTR(-EINVAL); + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), 
GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL); + if (!tg->se) + goto err; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, + cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, + cpu_to_node(i)); + if (!se) + goto err; + + memset(cfs_rq, 0, sizeof(struct cfs_rq)); + memset(se, 0, sizeof(struct sched_entity)); + + tg->cfs_rq[i] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[i] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = NICE_0_LOAD; + se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); + se->parent = NULL; + } + + tg->shares = NICE_0_LOAD; + + /* Bind the container to task_grp object we just created */ + tg->css.container = cont; + + return &tg->css; + +err: + for_each_possible_cpu(i) { + if (tg->cfs_rq && tg->cfs_rq[i]) + kfree(tg->cfs_rq[i]); + if (tg->se && tg->se[i]) + kfree(tg->se[i]); + } + if (tg->cfs_rq) + kfree(tg->cfs_rq); + if (tg->se) + kfree(tg->se); + if (tg) + kfree(tg); + + return ERR_PTR(-ENOMEM); +} + + +/* destroy runqueue etc associated with a task group */ +static void sched_destroy_group(struct container_subsys *ss, + struct container *cont) +{ + struct task_grp *tg = container_tg(cont); + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + /* wait for possible concurrent references to cfs_rqs complete */ + synchronize_sched(); + + /* now it should be safe to free those cfs_rqs */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + kfree(cfs_rq); + + se = tg->se[i]; + kfree(se); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg); +} + +static int sched_can_attach(struct container_subsys *ss, + struct container *cont, struct task_struct *tsk) +{ + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; + + return 0; +} + +/* change task's runqueue when it moves between groups */ +static void sched_move_task(struct container_subsys *ss, struct container *cont, + struct container *old_cont, struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + if (tsk->sched_class != &fair_sched_class) + goto done; + + update_rq_clock(rq); + + running = task_running(rq, tsk); + on_rq = tsk->se.on_rq; + + if (on_rq) + dequeue_task(rq, tsk, 0); + + set_task_cfs_rq(tsk); + + if (on_rq) + enqueue_task(rq, tsk, 0); + +done: + task_rq_unlock(rq, &flags); +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + int on_rq; + + spin_lock_irq(&rq->lock); + + on_rq = se->on_rq; + if (on_rq) + dequeue_entity(cfs_rq, se, 0); + + se->load.weight = shares; + se->load.inv_weight = div64_64((1ULL<<32), shares); + + if (on_rq) + enqueue_entity(cfs_rq, se, 0); + + spin_unlock_irq(&rq->lock); +} + +static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype, + struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) +{ + int i; + unsigned long shareval; + struct task_grp *tg = container_tg(cont); + char buffer[2*sizeof(unsigned long) + 1]; + + if (nbytes > 2*sizeof(unsigned long)) /* safety check */ + return -E2BIG; 
+ + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + shareval = simple_strtoul(buffer, NULL, 10); + + tg->shares = shareval; + for_each_possible_cpu(i) + set_se_shares(tg->se[i], shareval); + + return nbytes; +} + +static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft) +{ + struct task_grp *tg = container_tg(cont); + + return (u64) tg->shares; +} + +struct cftype cpuctl_share = { + .name = "shares", + .read_uint = cpu_shares_read_uint, + .write = cpu_shares_write, +}; + +static int sched_populate(struct container_subsys *ss, struct container *cont) +{ + return container_add_file(cont, ss, &cpuctl_share); +} + +struct container_subsys cpu_subsys = { + .name = "cpu", + .create = sched_create_group, + .destroy = sched_destroy_group, + .can_attach = sched_can_attach, + .attach = sched_move_task, + .populate = sched_populate, + .subsys_id = cpu_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index c3ee38b..e963cca 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -28,6 +28,31 @@ #define SEQ_printf(m, x...) \ printk(x); \ } while (0) +/* + * Ease the printing of nsec fields: + */ +static inline long long nsec_high(long long nsec) +{ + if (nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static inline unsigned long nsec_low(long long nsec) +{ + if (nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -36,23 +61,19 @@ print_task(struct seq_file *m, struct rq else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, p->pid, - (long long)p->se.fair_key, - (long long)(p->se.fair_key - rq->cfs.fair_clock), - (long long)p->se.wait_runtime, + SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - (long long)p->se.sum_exec_runtime, - (long long)p->se.sum_wait_runtime, - (long long)p->se.sum_sleep_runtime, - (long long)p->se.wait_runtime_overruns, - (long long)p->se.wait_runtime_underruns); + SEQ_printf(m, "%15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", - 0LL, 0LL, 0LL, 0LL, 0LL); + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif } @@ -62,14 +83,10 @@ static void print_rq(struct seq_file *m, SEQ_printf(m, "\nrunnable tasks:\n" - " task PID tree-key delta waiting" - " switches prio" - " sum-exec sum-wait sum-sleep" - " wait-overrun wait-underrun\n" - "------------------------------------------------------------------" - "----------------" - "------------------------------------------------" - "--------------------------------\n"); + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" + "------------------------------------------------------" + "------------------------------------------------"); read_lock_irq(&tasklist_lock); @@ -83,45 +100,42 @@ static void print_rq(struct seq_file *m, read_unlock_irq(&tasklist_lock); } -static void -print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 wait_runtime_rq_sum = 0; - struct task_struct *p; - struct rb_node *curr; - unsigned long flags; + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; struct rq *rq = &per_cpu(runqueues, cpu); + struct sched_entity *last; + unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); - curr = first_fair(cfs_rq); - while (curr) { - p = rb_entry(curr, struct task_struct, se.run_node); - wait_runtime_rq_sum += p->se.wait_runtime; - - curr = rb_next(curr); - } - spin_unlock_irqrestore(&rq->lock, flags); - - SEQ_printf(m, " .%-30s: %Ld\n", "wait_runtime_rq_sum", - (long long)wait_runtime_rq_sum); -} - -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ SEQ_printf(m, "\ncfs_rq\n"); -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) - - P(fair_clock); - P(exec_clock); - P(wait_runtime); - P(wait_runtime_overruns); - P(wait_runtime_underruns); - P(sleeper_bonus); -#undef P + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); - print_cfs_rq_runtime_sum(m, cpu, cfs_rq); + spin_lock_irqsave(&rq->lock, flags); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + min_vruntime = rq->cfs.min_vruntime; + rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; + spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %ld\n", "spread0", + cfs_rq->nr_sync_min_vruntime); } static void print_cpu(struct seq_file *m, int cpu) @@ -141,31 +155,32 @@ #endif #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->ls.load.weight); - P(ls.delta_fair); - P(ls.delta_exec); + rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies); - P(next_balance); + PN(next_balance); P(curr->pid); - P(clock); - P(idle_clock); - P(prev_clock_raw); + PN(clock); + PN(idle_clock); + PN(prev_clock_raw); P(clock_warps); P(clock_overflows); P(clock_deep_idle_events); - P(clock_max_delta); + PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); P(cpu_load[3]); P(cpu_load[4]); #undef P +#undef PN print_cfs_stats(m, cpu); @@ -182,7 +197,7 @@ static int sched_debug_show(struct seq_f (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); + SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -240,24 +255,22 @@ void proc_sched_show_task(struct task_st SEQ_printf(m, "----------------------------------------------\n"); #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) +#define PN(F) \ + SEQ_printf(m, "%-25s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) - 
P(se.wait_runtime); - P(se.wait_start_fair); - P(se.exec_start); - P(se.sleep_start_fair); - P(se.sum_exec_runtime); + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - P(se.wait_start); - P(se.sleep_start); - P(se.block_start); - P(se.sleep_max); - P(se.block_max); - P(se.exec_max); - P(se.wait_max); - P(se.wait_runtime_overruns); - P(se.wait_runtime_underruns); - P(se.sum_wait_runtime); + PN(se.wait_start); + PN(se.sleep_start); + PN(se.block_start); + PN(se.sleep_max); + PN(se.block_max); + PN(se.exec_max); + PN(se.slice_max); + PN(se.wait_max); #endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); @@ -265,6 +278,7 @@ #endif P(policy); P(prio); #undef P +#undef PN { u64 t0, t1; @@ -279,9 +293,12 @@ #undef P void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; - p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; + p->se.sleep_max = 0; + p->se.block_max = 0; + p->se.exec_max = 0; + p->se.slice_max = 0; + p->se.wait_max = 0; #endif - p->se.sum_exec_runtime = 0; + p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c9fbe8e..10eceb7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -34,7 +34,13 @@ * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; +const_debug unsigned int sysctl_sched_latency = 20000000ULL; + +/* + * After fork, child runs first. (default) If set to 0 then + * parent will (try to) run first. + */ +const_debug unsigned int sysctl_sched_child_runs_first = 1; /* * Minimal preemption granularity for CPU-bound tasks: @@ -58,7 +64,7 @@ unsigned int __read_mostly sysctl_sched_ * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; +const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -68,35 +74,10 @@ unsigned int sysctl_sched_batch_wakeup_g * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; - -unsigned int sysctl_sched_stat_granularity __read_mostly; +const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL; -/* - * Initialized in sched_init_granularity() [to 5 times the base granularity]: - */ unsigned int sysctl_sched_runtime_limit __read_mostly; -/* - * Debugging: various feature bits - */ -enum { - SCHED_FEAT_FAIR_SLEEPERS = 1, - SCHED_FEAT_SLEEPER_AVG = 2, - SCHED_FEAT_SLEEPER_LOAD_AVG = 4, - SCHED_FEAT_PRECISE_CPU_LOAD = 8, - SCHED_FEAT_START_DEBIT = 16, - SCHED_FEAT_SKIP_INITIAL = 32, -}; - -unsigned int sysctl_sched_features __read_mostly = - SCHED_FEAT_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *0 | - SCHED_FEAT_SLEEPER_LOAD_AVG *1 | - SCHED_FEAT_PRECISE_CPU_LOAD *1 | - SCHED_FEAT_START_DEBIT *1 | - SCHED_FEAT_SKIP_INITIAL *0; - extern struct sched_class fair_sched_class; /************************************************************** @@ -111,21 +92,9 @@ static inline struct rq *rq_of(struct cf return cfs_rq->rq; } -/* currently running entity (if any) on this cfs_rq */ -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - return cfs_rq->curr; -} - /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - cfs_rq->curr = se; -} - #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -133,21 +102,8 @@ static inline struct rq *rq_of(struct cf return container_of(cfs_rq, struct rq, cfs); } -static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - - if (unlikely(rq->curr->sched_class != &fair_sched_class)) - return NULL; - - return &rq->curr->se; -} - #define entity_is_task(se) 1 -static inline void -set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline struct task_struct *task_of(struct sched_entity *se) @@ -160,16 +116,42 @@ static inline struct task_struct *task_o * Scheduling class tree data structure manipulation methods: */ +static inline u64 +max_vruntime(u64 min_vruntime, u64 vruntime) +{ + if ((vruntime > min_vruntime) || + (min_vruntime > (1ULL << 61) && vruntime < (1ULL << 50))) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline void +set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost) +{ + struct sched_entity *se; + + cfs_rq->rb_leftmost = leftmost; + if (leftmost) + se = rb_entry(leftmost, struct sched_entity, run_node); +} + +static inline s64 +entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->vruntime - cfs_rq->min_vruntime; +} + /* * Enqueue an entity into the rb-tree: */ -static inline void +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - s64 key = se->fair_key; + s64 key = entity_key(cfs_rq, se); int leftmost = 1; /* @@ -182,7 +164,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, * We dont care about collisions. Nodes with * the same key stay together. 
*/ - if (key - entry->fair_key < 0) { + if (key < entity_key(cfs_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; @@ -195,28 +177,19 @@ __enqueue_entity(struct cfs_rq *cfs_rq, * used): */ if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; + set_leftmost(cfs_rq, &se->run_node); rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); - update_load_add(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running++; - se->on_rq = 1; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static inline void +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); - update_load_sub(&cfs_rq->load, se->load.weight); - cfs_rq->nr_running--; - se->on_rq = 0; + set_leftmost(cfs_rq, rb_next(&se->run_node)); - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -229,118 +202,47 @@ static struct sched_entity *__pick_next_ return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); } -/************************************************************** - * Scheduling class statistics methods: - */ - -/* - * Calculate the preemption granularity needed to schedule every - * runnable task once per sysctl_sched_latency amount of time. - * (down to a sensible low limit on granularity) - * - * For example, if there are 2 tasks running and latency is 10 msecs, - * we switch tasks every 5 msecs. If we have 3 tasks running, we have - * to switch tasks every 3.33 msecs to get a 10 msecs observed latency - * for each task. We do finer and finer scheduling up to until we - * reach the minimum granularity value. - * - * To achieve this we use the following dynamic-granularity rule: - * - * gran = lat/nr - lat/nr/nr - * - * This comes out of the following equations: - * - * kA1 + gran = kB1 - * kB2 + gran = kA2 - * kA2 = kA1 - * kB2 = kB1 - d + d/nr - * lat = d * nr - * - * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), - * '1' is start of time, '2' is end of time, 'd' is delay between - * 1 and 2 (during which task B was running), 'nr' is number of tasks - * running, 'lat' is the the period of each task. ('lat' is the - * sched_latency that we aim for.) 
- */ -static long -sched_granularity(struct cfs_rq *cfs_rq) +static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - unsigned int gran = sysctl_sched_latency; - unsigned int nr = cfs_rq->nr_running; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *se = NULL; + struct rb_node *parent; - if (nr > 1) { - gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_min_granularity); + while (*link) { + parent = *link; + se = rb_entry(parent, struct sched_entity, run_node); + link = &parent->rb_right; } - return gran; + return se; } -/* - * We rescale the rescheduling granularity of tasks according to their - * nice level, but only linearly, not exponentially: +/************************************************************** + * Scheduling class statistics methods: */ -static long -niced_granularity(struct sched_entity *curr, unsigned long granularity) + +static u64 __sched_period(unsigned long nr_running) { - u64 tmp; + u64 period = sysctl_sched_latency; + unsigned long nr_latency = + sysctl_sched_latency / sysctl_sched_min_granularity; - if (likely(curr->load.weight == NICE_0_LOAD)) - return granularity; - /* - * Positive nice levels get the same granularity as nice-0: - */ - if (likely(curr->load.weight < NICE_0_LOAD)) { - tmp = curr->load.weight * (u64)granularity; - return (long) (tmp >> NICE_0_SHIFT); + if (unlikely(nr_running > nr_latency)) { + period *= nr_running; + do_div(period, nr_latency); } - /* - * Negative nice level tasks get linearly finer - * granularity: - */ - tmp = curr->load.inv_weight * (u64)granularity; - /* - * It will always fit into 'long': - */ - return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); + return period; } -static inline void -limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - long limit = sysctl_sched_runtime_limit; + u64 period = __sched_period(cfs_rq->nr_running); - /* - * Niced tasks have the same history dynamic range as - * non-niced tasks: - */ - if (unlikely(se->wait_runtime > limit)) { - se->wait_runtime = limit; - schedstat_inc(se, wait_runtime_overruns); - schedstat_inc(cfs_rq, wait_runtime_overruns); - } - if (unlikely(se->wait_runtime < -limit)) { - se->wait_runtime = -limit; - schedstat_inc(se, wait_runtime_underruns); - schedstat_inc(cfs_rq, wait_runtime_underruns); - } -} + period *= se->load.weight; + do_div(period, cfs_rq->load.weight); -static inline void -__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - se->wait_runtime += delta; - schedstat_add(se, sum_wait_runtime, delta); - limit_wait_runtime(cfs_rq, se); -} - -static void -add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) -{ - schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); - __add_wait_runtime(cfs_rq, se, delta); - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); + return period; } /* @@ -348,173 +250,111 @@ add_wait_runtime(struct cfs_rq *cfs_rq, * are not in our scheduling class. 
*/ static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) { - unsigned long delta, delta_exec, delta_fair, delta_mine; - struct load_weight *lw = &cfs_rq->load; - unsigned long load = lw->weight; + unsigned long delta_exec_weighted; + u64 next_vruntime, min_vruntime; - delta_exec = curr->delta_exec; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; - cfs_rq->exec_clock += delta_exec; - - if (unlikely(!load)) - return; - - delta_fair = calc_delta_fair(delta_exec, lw); - delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - - if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { - delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); - delta = min(delta, (unsigned long)( - (long)sysctl_sched_runtime_limit - curr->wait_runtime)); - cfs_rq->sleeper_bonus -= delta; - delta_mine -= delta; + schedstat_add(cfs_rq, exec_clock, delta_exec); + delta_exec_weighted = delta_exec; + if (unlikely(curr->load.weight != NICE_0_LOAD)) { + delta_exec_weighted = calc_delta_fair(delta_exec_weighted, + &curr->load); } + curr->vruntime += delta_exec_weighted; - cfs_rq->fair_clock += delta_fair; /* - * We executed delta_exec amount of time on the CPU, - * but we were only entitled to delta_mine amount of - * time during that period (if nr_running == 1 then - * the two values are equal) - * [Note: delta_mine - delta_exec is negative]: + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. */ - add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); + if (first_fair(cfs_rq)) { + next_vruntime = __pick_next_entity(cfs_rq)->vruntime; + + /* min_vruntime() := !max_vruntime() */ + min_vruntime = max_vruntime(curr->vruntime, next_vruntime); + if (min_vruntime == next_vruntime) + min_vruntime = curr->vruntime; + else + min_vruntime = next_vruntime; + } else + min_vruntime = curr->vruntime; + + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, min_vruntime); +} + +static void sync_vruntime(struct cfs_rq *cfs_rq) +{ + struct rq *rq; + int cpu; + + for_each_online_cpu(cpu) { + rq = &per_cpu(runqueues, cpu); + cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, + rq->cfs.min_vruntime); + } + schedstat_inc(cfs_rq, nr_sync_min_vruntime); } static void update_curr(struct cfs_rq *cfs_rq) { - struct sched_entity *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 now = rq_of(cfs_rq)->clock; unsigned long delta_exec; if (unlikely(!curr)) - return; + return sync_vruntime(cfs_rq); /* * Get the amount of time the current task was running * since the last time we changed load (this cannot * overflow on 32 bits): */ - delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); + delta_exec = (unsigned long)(now - curr->exec_start); - curr->delta_exec += delta_exec; - - if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr); - curr->delta_exec = 0; - } - curr->exec_start = rq_of(cfs_rq)->clock; + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; } static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->wait_start_fair = cfs_rq->fair_clock; schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } -/* - * We calculate fair deltas here, so protect against the random effects - * of a multiplication overflow by capping it to the runtime 
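The key change in __update_curr() above is that the observed wall-clock delta is converted into virtual time before being added to vruntime. A sketch of that conversion, assuming calc_delta_fair() scales the delta by NICE_0_LOAD/weight (NICE_0_LOAD taken as 1024 here; values are illustrative):

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL

static uint64_t weighted_delta(uint64_t delta_exec_ns, uint64_t weight)
{
        if (weight == NICE_0_LOAD)
                return delta_exec_ns;
        /* heavier entities accumulate vruntime more slowly */
        return delta_exec_ns * NICE_0_LOAD / weight;
}

int main(void)
{
        /* 10 ms of execution: a nice-0 weight advances vruntime by 10 ms... */
        printf("%llu\n", (unsigned long long)weighted_delta(10000000, 1024));
        /* ...while a twice-as-heavy entity advances by only 5 ms */
        printf("%llu\n", (unsigned long long)weighted_delta(10000000, 2048));
        return 0;
}

min_vruntime is then pulled forward monotonically from whichever of curr and the leftmost queued entity is behind, which is what the max_vruntime() dance in the hunk implements.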
limit: - */ -#if BITS_PER_LONG == 32 static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) +calc_weighted(unsigned long delta, struct sched_entity *se) { - u64 tmp = (u64)delta * weight >> shift; + unsigned long weight = se->load.weight; - if (unlikely(tmp > sysctl_sched_runtime_limit*2)) - return sysctl_sched_runtime_limit*2; - return tmp; -} -#else -static inline unsigned long -calc_weighted(unsigned long delta, unsigned long weight, int shift) -{ - return delta * weight >> shift; + if (unlikely(weight != NICE_0_LOAD)) + return (u64)delta * se->load.weight >> NICE_0_SHIFT; + else + return delta; } -#endif /* * Task is being enqueued - update stats: */ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 key; - /* * Are we enqueueing a waiting task? (for current tasks * a dequeue/enqueue event is a NOP) */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_start(cfs_rq, se); - /* - * Update the key: - */ - key = cfs_rq->fair_clock; - - /* - * Optimize the common nice 0 case: - */ - if (likely(se->load.weight == NICE_0_LOAD)) { - key -= se->wait_runtime; - } else { - u64 tmp; - - if (se->wait_runtime < 0) { - tmp = -se->wait_runtime; - key += (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } else { - tmp = se->wait_runtime; - key -= (tmp * se->load.inv_weight) >> - (WMULT_SHIFT - NICE_0_SHIFT); - } - } - - se->fair_key = key; -} - -/* - * Note: must be called with a freshly updated rq->fair_clock. - */ -static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - unsigned long delta_fair = se->delta_fair_run; - - schedstat_set(se->wait_max, max(se->wait_max, - rq_of(cfs_rq)->clock - se->wait_start)); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - add_wait_runtime(cfs_rq, se, delta_fair); } static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long delta_fair; - - if (unlikely(!se->wait_start_fair)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->wait_start_fair)); - - se->delta_fair_run += delta_fair; - if (unlikely(abs(se->delta_fair_run) >= - sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se); - se->delta_fair_run = 0; - } - - se->wait_start_fair = 0; + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); schedstat_set(se->wait_start, 0); } @@ -526,7 +366,7 @@ update_stats_dequeue(struct cfs_rq *cfs_ * Mark the end of the wait period if dequeueing a * waiting task: */ - if (se != cfs_rq_curr(cfs_rq)) + if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); } @@ -555,66 +395,24 @@ update_stats_curr_end(struct cfs_rq *cfs * Scheduling class queueing methods: */ -static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long load = cfs_rq->load.weight, delta_fair; - long prev_runtime; - - /* - * Do not boost sleepers if there's too much bonus 'in flight' - * already: - */ - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - return; - - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) - load = rq_of(cfs_rq)->cpu_load[2]; - - delta_fair = se->delta_fair_sleep; - - /* - * Fix up delta_fair with the effect of us running - * during the whole sleep 
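The simplified calc_weighted() above scales a statistics delta by the entity's weight relative to the nice-0 weight; the runtime-limit capping of the old 32-bit variant is gone because the wait_runtime machinery it protected no longer exists. A standalone sketch, assuming NICE_0_SHIFT is 10:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_SHIFT 10
#define NICE_0_LOAD  (1UL << NICE_0_SHIFT)

static unsigned long calc_weighted_sketch(unsigned long delta,
                                          unsigned long weight)
{
        if (weight != NICE_0_LOAD)
                return (uint64_t)delta * weight >> NICE_0_SHIFT;
        return delta;
}

int main(void)
{
        printf("%lu\n", calc_weighted_sketch(1000, 1024));      /* 1000 */
        printf("%lu\n", calc_weighted_sketch(1000, 2048));      /* 2000 */
        return 0;
}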
period: - */ - if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) - delta_fair = div64_likely32((u64)delta_fair * load, - load + se->load.weight); - - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta_fair = calc_weighted(delta_fair, se->load.weight, - NICE_0_SHIFT); - - prev_runtime = se->wait_runtime; - __add_wait_runtime(cfs_rq, se, delta_fair); - delta_fair = se->wait_runtime - prev_runtime; + update_load_add(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running++; + se->on_rq = 1; +} - /* - * Track the amount of bonus we've given to sleepers: - */ - cfs_rq->sleeper_bonus += delta_fair; +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + cfs_rq->nr_running--; + se->on_rq = 0; } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *tsk = task_of(se); - unsigned long delta_fair; - - if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || - !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) - return; - - delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), - (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); - - se->delta_fair_sleep += delta_fair; - if (unlikely(abs(se->delta_fair_sleep) >= - sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se); - se->delta_fair_sleep = 0; - } - - se->sleep_start_fair = 0; - #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; @@ -644,27 +442,71 @@ #endif } static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { + u64 min_runtime, latency; + + min_runtime = cfs_rq->min_vruntime; + + if (sched_feat(USE_TREE_AVG)) { + struct sched_entity *last = __pick_last_entity(cfs_rq); + if (last) { + min_runtime = __pick_next_entity(cfs_rq)->vruntime; + min_runtime += last->vruntime; + min_runtime >>= 1; + } + } else if (sched_feat(APPROX_AVG)) + min_runtime += sysctl_sched_latency/2; + + if (initial && sched_feat(START_DEBIT)) + min_runtime += sched_slice(cfs_rq, se); + + if (!initial && sched_feat(NEW_FAIR_SLEEPERS)) { + latency = sysctl_sched_latency; + if (min_runtime > latency) + min_runtime -= latency; + else + min_runtime = 0; + } + + se->vruntime = max(se->vruntime, min_runtime); +} + +static void +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup, int set_curr) +{ + /* + * In case of the 'current'. + */ + if (unlikely(set_curr)) { + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; + account_entity_enqueue(cfs_rq, se); + return; + } + /* * Update the fair clock. 
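place_entity() above replaces the old sleeper-bonus bookkeeping with a direct placement in virtual time. A reduced sketch of the policy, with the USE_TREE_AVG/APPROX_AVG variants omitted and the feature flags passed in as plain booleans (all names below are illustrative):

#include <stdint.h>

typedef uint64_t u64;

static inline u64 place_vruntime(u64 se_vruntime, u64 min_vruntime,
                                 u64 latency, u64 slice,
                                 int initial, int new_fair_sleepers,
                                 int start_debit)
{
        u64 vruntime = min_vruntime;

        /* new tasks start one slice in the future (START_DEBIT) */
        if (initial && start_debit)
                vruntime += slice;

        /* waking sleepers are credited up to one full latency period */
        if (!initial && new_fair_sleepers)
                vruntime = vruntime > latency ? vruntime - latency : 0;

        /* but an entity is never moved backwards in virtual time */
        return se_vruntime > vruntime ? se_vruntime : vruntime;
}

The final max() is what keeps a long sleeper from banking an unbounded credit: it can be placed at most one latency period ahead of the rest of the queue.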
*/ update_curr(cfs_rq); - if (wakeup) + if (wakeup) { + place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); + } update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); } static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { update_stats_dequeue(cfs_rq, se); - if (sleep) { - se->sleep_start_fair = cfs_rq->fair_clock; #ifdef CONFIG_SCHEDSTATS + if (sleep) { if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -673,46 +515,28 @@ #ifdef CONFIG_SCHEDSTATS if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } + } #endif + if (likely(se != cfs_rq->curr)) + __dequeue_entity(cfs_rq, se); + else { + update_stats_curr_end(cfs_rq, se); + cfs_rq->curr = NULL; } - __dequeue_entity(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); } /* * Preempt the current task with a newly woken task if needed: */ static void -__check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, - struct sched_entity *curr, unsigned long granularity) +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - s64 __delta = curr->fair_key - se->fair_key; unsigned long ideal_runtime, delta_exec; - /* - * ideal_runtime is compared against sum_exec_runtime, which is - * walltime, hence do not scale. - */ - ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, - (unsigned long)sysctl_sched_min_granularity); - - /* - * If we executed more than what the latency constraint suggests, - * reduce the rescheduling granularity. This way the total latency - * of how much a task is not scheduled converges to - * sysctl_sched_latency: - */ + ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) - granularity = 0; - - /* - * Take scheduling granularity into account - do not - * preempt the current task unless the best task has - * a larger than sched_granularity fairness advantage: - * - * scale granularity as key space is in fair_clock. - */ - if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); } @@ -722,13 +546,22 @@ set_next_entity(struct cfs_rq *cfs_rq, s /* * Any task has to be enqueued before it get to execute on * a CPU. So account for the time it spent waiting on the - * runqueue. (note, here we rely on pick_next_task() having - * done a put_prev_task_fair() shortly before this, which - * updated rq->fair_clock - used by update_stats_wait_end()) + * runqueue. */ update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); - set_cfs_rq_curr(cfs_rq, se); + cfs_rq->curr = se; +#ifdef CONFIG_SCHEDSTATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + se->slice_max = max(se->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -736,6 +569,10 @@ static struct sched_entity *pick_next_en { struct sched_entity *se = __pick_next_entity(cfs_rq); + /* 'current' is not kept within the tree. 
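check_preempt_tick() above boils the old granularity logic down to a single comparison: the running entity is preempted once it has consumed its wall-clock slice for the current period. A minimal sketch, with the slice supplied by the caller:

#include <stdint.h>

typedef uint64_t u64;

struct entity_sketch {
        u64 sum_exec_runtime;           /* total wall-clock runtime */
        u64 prev_sum_exec_runtime;      /* snapshot taken in set_next_entity() */
};

static inline int tick_wants_resched(const struct entity_sketch *curr,
                                     u64 ideal_runtime)
{
        u64 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

        /* ideal_runtime would come from sched_slice(cfs_rq, curr) */
        return delta_exec > ideal_runtime;
}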
*/ + if (se) + __dequeue_entity(cfs_rq, se); + set_next_entity(cfs_rq, se); return se; @@ -752,31 +589,23 @@ static void put_prev_entity(struct cfs_r update_stats_curr_end(cfs_rq, prev); - if (prev->on_rq) + if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); - set_cfs_rq_curr(cfs_rq, NULL); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } + cfs_rq->curr = NULL; } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *next; - - /* - * Dequeue and enqueue the task to update its - * position within the tree: - */ - dequeue_entity(cfs_rq, curr, 0); - enqueue_entity(cfs_rq, curr, 0); - /* - * Reschedule if another task tops the current one. + * Update run-time statistics of the 'current'. */ - next = __pick_next_entity(cfs_rq); - if (next == curr) - return; + update_curr(cfs_rq); - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + if (cfs_rq->nr_running > 1) + check_preempt_tick(cfs_rq, curr); } /************************************************** @@ -811,8 +640,7 @@ static inline struct cfs_rq *group_cfs_r */ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) { - /* A later patch will take group into account */ - return &cpu_rq(this_cpu)->cfs; + return cfs_rq->tg->cfs_rq[this_cpu]; } /* Iterate thr' all leaf cfs_rq's on a runqueue */ @@ -876,12 +704,17 @@ static void enqueue_task_fair(struct rq { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int set_curr = 0; + + /* Are we enqueuing the current task? */ + if (unlikely(task_running(rq, p))) + set_curr = 1; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); + enqueue_entity(cfs_rq, se, wakeup, set_curr); } } @@ -909,11 +742,11 @@ static void dequeue_task_fair(struct rq * * If compat_yield is turned on then we requeue to the end of the tree. 
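The hunks above complete the "current is not kept in the tree" invariant: pick_next_entity() removes the chosen entity from the timeline and put_prev_entity() re-inserts it at its updated vruntime. A toy model of that round trip, with the rbtree and the on_rq check abstracted away (every name here is illustrative):

#include <assert.h>
#include <stddef.h>

struct se_toy { int in_tree; };

struct cfs_rq_toy {
        struct se_toy *curr;            /* running entity, kept outside the tree */
        struct se_toy *leftmost;        /* stand-in for the rbtree's leftmost node */
};

static struct se_toy *pick_next_toy(struct cfs_rq_toy *cfs_rq)
{
        struct se_toy *se = cfs_rq->leftmost;

        if (se) {                       /* __dequeue_entity() */
                cfs_rq->leftmost = NULL;
                se->in_tree = 0;
        }
        cfs_rq->curr = se;              /* set_next_entity() */
        return se;
}

static void put_prev_toy(struct cfs_rq_toy *cfs_rq, struct se_toy *prev)
{
        cfs_rq->leftmost = prev;        /* __enqueue_entity(), at the new vruntime */
        prev->in_tree = 1;
        cfs_rq->curr = NULL;
}

int main(void)
{
        struct se_toy a = { .in_tree = 1 };
        struct cfs_rq_toy rq = { .curr = NULL, .leftmost = &a };
        struct se_toy *se = pick_next_toy(&rq);

        assert(se == &a && !a.in_tree);         /* current left the tree */
        put_prev_toy(&rq, se);
        assert(a.in_tree && rq.curr == NULL);   /* and went back in */
        return 0;
}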
*/ -static void yield_task_fair(struct rq *rq, struct task_struct *p) +static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = &rq->cfs; struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *rightmost, *se = &p->se; + struct sched_entity *rightmost, *se = &rq->curr->se; struct rb_node *parent; /* @@ -928,8 +761,8 @@ static void yield_task_fair(struct rq *r * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + dequeue_entity(cfs_rq, se, 0); + enqueue_entity(cfs_rq, se, 0, 1); return; } @@ -951,7 +784,7 @@ static void yield_task_fair(struct rq *r /* * Minimally necessary key value to be last in the tree: */ - se->fair_key = rightmost->fair_key + 1; + se->vruntime = rightmost->vruntime + 1; if (cfs_rq->rb_leftmost == &se->run_node) cfs_rq->rb_leftmost = rb_next(&se->run_node); @@ -966,11 +799,10 @@ static void yield_task_fair(struct rq *r /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - unsigned long gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -978,16 +810,12 @@ static void check_preempt_curr_fair(stru resched_task(curr); return; } + if (is_same_group(curr, p)) { + s64 delta = curr->se.vruntime - p->se.vruntime; - gran = sysctl_sched_wakeup_granularity; - /* - * Batch tasks prefer throughput over latency: - */ - if (unlikely(p->policy == SCHED_BATCH)) - gran = sysctl_sched_batch_wakeup_granularity; - - if (is_same_group(curr, p)) - __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); + if (delta > (s64)sysctl_sched_wakeup_granularity) + resched_task(curr); + } } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1143,6 +971,8 @@ static void task_tick_fair(struct rq *rq } } +#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) + /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks @@ -1153,55 +983,28 @@ static void task_tick_fair(struct rq *rq static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; sched_info_queued(p); update_curr(cfs_rq); - update_stats_enqueue(cfs_rq, se); - /* - * Child runs first: we let it run before the parent - * until it reschedules once. 
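check_preempt_wakeup() above replaces the old key/granularity scaling with a direct vruntime comparison: a freshly woken task preempts current only when current has pulled ahead in virtual time by more than the wakeup granularity. The RT fast path and the same-group check from the hunk are left out of this sketch:

#include <stdint.h>

typedef uint64_t u64;
typedef int64_t  s64;

static inline int wakeup_preempts(u64 curr_vruntime, u64 woken_vruntime,
                                  u64 wakeup_granularity_ns)
{
        s64 delta = (s64)(curr_vruntime - woken_vruntime);

        return delta > (s64)wakeup_granularity_ns;
}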
We set up the key so that - * it will preempt the parent: - */ - se->fair_key = curr->fair_key - - niced_granularity(curr, sched_granularity(cfs_rq)) - 1; - /* - * The first wait is dominated by the child-runs-first logic, - * so do not credit it with that waiting time yet: - */ - if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - se->wait_start_fair = 0; + place_entity(cfs_rq, se, 1); - /* - * The statistical average of wait_runtime is about - * -granularity/2, so initialize the task with that: - */ - if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - se->wait_runtime = -(sched_granularity(cfs_rq) / 2); + if (sysctl_sched_child_runs_first && + curr->vruntime < se->vruntime) { + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ + swap(curr->vruntime, se->vruntime); + } + update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); + account_entity_enqueue(cfs_rq, se); + resched_task(rq->curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_curr_task_fair(struct rq *rq) -{ - struct sched_entity *se = &rq->curr->se; - - for_each_sched_entity(se) - set_next_entity(cfs_rq_of(se), se); -} -#else -static void set_curr_task_fair(struct rq *rq) -{ -} -#endif - /* * All the scheduling class methods: */ @@ -1210,14 +1013,13 @@ struct sched_class fair_sched_class __re .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, - .check_preempt_curr = check_preempt_curr_fair, + .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .load_balance = load_balance_fair, - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 4b87476..3c77c03 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -59,9 +59,9 @@ static void requeue_task_rt(struct rq *r } static void -yield_task_rt(struct rq *rq, struct task_struct *p) +yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, p); + requeue_task_rt(rq, rq->curr); } /* diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c20a94d..1d9ec98 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -129,7 +129,7 @@ # define schedstat_add(rq, field, amt) d # define schedstat_set(var, val) do { } while (0) #endif -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given * the cpu. 
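task_new_fair() above now relies on place_entity() plus a single swap to implement child-runs-first: if the child would be placed behind its parent in virtual time, the two vruntimes are exchanged before the child is enqueued, and the parent is then rescheduled. A sketch of just the swap step (names illustrative, mirroring the local swap() macro added in the hunk):

#include <stdint.h>

typedef uint64_t u64;

#define swap_u64(a, b) do { u64 tmp = (a); (a) = (b); (b) = tmp; } while (0)

static inline void maybe_run_child_first(u64 *parent_vruntime,
                                         u64 *child_vruntime,
                                         int sysctl_child_runs_first)
{
        /* child ends up with the smaller vruntime, so it sorts leftmost */
        if (sysctl_child_runs_first && *parent_vruntime < *child_vruntime)
                swap_u64(*parent_vruntime, *child_vruntime);
}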
We should note that with the exception of interactive @@ -233,5 +233,5 @@ sched_info_switch(struct task_struct *pr #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +#endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53a456e..ff7ae03 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -266,28 +266,6 @@ #ifdef CONFIG_SCHED_DEBUG }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_stat_granularity_ns", - .data = &sysctl_sched_stat_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_runtime_limit_ns", - .data = &sysctl_sched_runtime_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_sched_granularity_ns, - .extra2 = &max_sched_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int),