GIT 1c88e4d10d5a4da331a35a756a718efe1d69378c git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git

commit
Author: Ingo Molnar
Date:   Tue Nov 27 13:49:21 2007 +0100

    cpu hotplug: fix build on !CONFIG_SMP

    fix build on !CONFIG_SMP.

    Signed-off-by: Ingo Molnar

commit ba086e4c8496e8865323eb512e2f7a608c904878
Author: Gautham R Shenoy
Date:   Tue Nov 27 13:49:21 2007 +0100

    cpu-hotplug: replace per-subsystem mutexes with get_online_cpus()

    This patch converts the known per-subsystem mutexes to get_online_cpus
    put_online_cpus. It also eliminates the CPU_LOCK_ACQUIRE and
    CPU_LOCK_RELEASE hotplug notification events.

    Signed-off-by: Gautham R Shenoy
    Signed-off-by: Ingo Molnar

commit e6315e4be48dd4d652b229f8a4888083d20a1580
Author: Gautham R Shenoy
Date:   Tue Nov 27 13:49:20 2007 +0100

    cpu-hotplug: replace lock_cpu_hotplug() with get_online_cpus()

    Replace all lock_cpu_hotplug/unlock_cpu_hotplug from the kernel and use
    get_online_cpus and put_online_cpus instead, as it highlights the
    refcount semantics in these operations.

    The new API guarantees protection against the cpu-hotplug operation,
    but it doesn't guarantee serialized access to any of the local data
    structures. Hence the changes need to be reviewed.

    In case of pseries_add_processor/pseries_remove_processor, use
    cpu_maps_update_begin()/cpu_maps_update_done() as we're modifying the
    cpu_present_map there.

    Signed-off-by: Gautham R Shenoy
    Signed-off-by: Ingo Molnar

commit 7165eb6dbb52b3b6a6d21db7578d56abf6ffa457
Author: Gautham R Shenoy
Date:   Tue Nov 27 13:49:20 2007 +0100

    cpu-hotplug: refcount based cpu hotplug

    This patch implements a Refcount + Waitqueue based model for
    cpu-hotplug.

    Now, a thread which wants to prevent cpu-hotplug will bump up a global
    refcount, and the thread which wants to perform a cpu-hotplug operation
    will block till the global refcount goes to zero.

    The readers, if any, during an ongoing cpu-hotplug operation are
    blocked until the cpu-hotplug operation is over.

    Signed-off-by: Gautham R Shenoy
    Signed-off-by: Paul Jackson [For !CONFIG_HOTPLUG_CPU ]
    Signed-off-by: Ingo Molnar

commit d6b599c384481763738d6b79666c44a431e40807
Author: Srivatsa Vaddagiri
Date:   Tue Nov 27 13:49:20 2007 +0100

    sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups

    The current load balancing scheme isn't good enough for precise group
    fairness.

    For example: on an 8-cpu system, I created 3 groups as under:

        a = 8 tasks (cpu.shares = 1024)
        b = 4 tasks (cpu.shares = 1024)
        c = 3 tasks (cpu.shares = 1024)

    a, b and c are task groups that have equal weight. We would expect each
    of the groups to receive 33.33% of cpu bandwidth under a fair
    scheduler.

    This is what I get with the latest scheduler git tree:

    --------------------------------------------------------------------------------
    Col1  | Col2    | Col3  |  Col4
    ------|---------|-------|-------------------------------------------------------
    a     | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
    b     | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
    c     |  86.326 | 18.0% | 47.5% 47.9% 48.5%
    --------------------------------------------------------------------------------

    Explanation of o/p:

    Col1 -> Group name
    Col2 -> Cumulative execution time (in seconds) received by all tasks
            of that group in a 60sec window across 8 cpus
    Col3 -> CPU bandwidth received by the group in the 60sec window,
            expressed in percentage. Col3 data is derived as:
                Col3 = 100 * Col2 / (NR_CPUS * 60)
    Col4 -> CPU bandwidth received by each individual task of the group.
                Col4 = 100 * cpu_time_recd_by_task / 60

    [I can share the test case that produces a similar o/p if reqd]

    The deviation from desired group fairness is as below:

        a = +24.47%
        b =  -9.13%
        c = -15.33%

    which is quite high.

    After the patch below is applied, here are the results:

    --------------------------------------------------------------------------------
    Col1  | Col2    | Col3  |  Col4
    ------|---------|-------|-------------------------------------------------------
    a     | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
    b     | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
    c     | 160.653 | 33.5% | 85.8% 90.6% 91.4%
    --------------------------------------------------------------------------------

    Deviation from desired group fairness is as below:

        a = +0.67%
        b = -0.83%
        c = +0.17%

    which is far better IMO. Most of the other runs have yielded a
    deviation within +-2% at the most, which is good.

    Why do we see bad (group) fairness with the current scheduler?
    ==============================================================

    Currently a cpu's weight is just the summation of individual task
    weights. This can yield incorrect results. For ex: consider three
    groups as below on a 2-cpu system:

        CPU0         CPU1
        ---------------------------
        A (10)       B(5)  C(5)
        ---------------------------

    Group A has 10 tasks, all on CPU0; Groups B and C have 5 tasks each,
    all of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
    1024).

    The current scheme would yield a cpu weight of 10240 (10*1024) for
    each cpu, and the load balancer will think both CPUs are perfectly
    balanced and won't move around any tasks. This, however, would yield
    this bandwidth:

        A = 50%
        B = 25%
        C = 25%

    which is not the desired result.

    What's changing in the patch?
    =============================

        - How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
          defined (see below)

        - API Change
            - Two tunables introduced in sysfs (under SCHED_DEBUG) to
              control the frequency at which the load balance monitor
              thread runs.

    The basic change made in this patch is how cpu weight
    (rq->load.weight) is calculated. It is now calculated as the summation
    of group weights on a cpu, rather than the summation of task weights.
    The weight exerted by a group on a cpu depends on the shares allocated
    to it and also on the number of tasks the group has on that cpu
    compared to the total number of (runnable) tasks the group has in the
    system.

    Let,
        W(K,i)  = Weight of group K on cpu i
        T(K,i)  = Task load present in group K's cfs_rq on cpu i
        T(K)    = Total task load of group K across various cpus
        S(K)    = Shares allocated to group K
        NRCPUS  = Number of online cpus in the scheduler domain to
                  which group K is assigned.

    Then,
        W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
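
    As a quick sanity check of this formula, here is a minimal sketch (not
    taken from the patch below; group_cpu_weight() is a made-up helper)
    applied to the 2-cpu example above:

        /*
         * Hypothetical helper mirroring W(K,i) = S(K) * NRCPUS * T(K,i) / T(K).
         * rebalance_shares() in the patch below computes the same ratio
         * directly from the per-cpu cfs_rq load weights.
         */
        static unsigned long group_cpu_weight(unsigned long shares, int nrcpus,
                                              unsigned long load_on_cpu,
                                              unsigned long total_load)
        {
                if (!total_load)
                        return 0;
                return shares * nrcpus * load_on_cpu / total_load;
        }

        /*
         * Each task weighs NICE_0_LOAD (1024), each group has shares = 1024:
         *
         *   A: 10 tasks on CPU0 -> W(A,0) = 1024 * 2 * 10240 / 10240 = 2048
         *   B:  5 tasks on CPU1 -> W(B,1) = 1024 * 2 *  5120 /  5120 = 2048
         *   C:  5 tasks on CPU1 -> W(C,1) = 1024 * 2 *  5120 /  5120 = 2048
         *
         * rq->load.weight becomes 2048 on CPU0 and 4096 on CPU1, so the
         * load balancer now sees the imbalance that the per-task summation
         * (10240 vs 10240) hid.
         */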

    A load balance monitor thread is created at bootup, which periodically
    runs and adjusts each group's weight on each cpu. To avoid its
    overhead, two min/max tunables are introduced (under SCHED_DEBUG) to
    control the rate at which it runs.

    Signed-off-by: Srivatsa Vaddagiri
    Signed-off-by: Ingo Molnar

commit 88e2db3c636d709ab7d6c1afa49e29777c6997b9
Author: Srivatsa Vaddagiri
Date:   Tue Nov 27 13:49:19 2007 +0100

    sched: introduce a mutex and corresponding API to serialize access to doms_cur[] array

    The doms_cur[] array represents the various scheduling domains, which
    are mutually exclusive. Currently the cpusets code can modify this
    array (by calling partition_sched_domains()) as a result of the user
    modifying the sched_load_balance flag for various cpusets.
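
    Any code that walks doms_cur[] therefore has to be serialized against
    such updates. A minimal sketch of the reader side, assuming the
    lock_doms_cur()/unlock_doms_cur() helpers added by the patch below
    (walk_sched_domain() is just a placeholder):

        int i;

        lock_doms_cur();                        /* mutex added by this patch */
        for (i = 0; i < ndoms_cur; i++)
                walk_sched_domain(&doms_cur[i]);        /* placeholder */
        unlock_doms_cur();
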
This patch introduces a mutex and corresponding API (only when CONFIG_FAIR_GROUP_SCHED is defined) which allows a reader to safely read the doms_cur[] array w/o worrying abt concurrent modifications to the array. The fair group scheduler code (introduced in next patch of this series) makes use of this mutex to walk thr' doms_cur[] array while rebalancing shares of task groups across cpus. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar commit f62e652cf70f33a4c5c7a3469863907bf113b650 Author: Srivatsa Vaddagiri Date: Tue Nov 27 13:49:19 2007 +0100 sched: group scheduling, change how cpu load is calculated This patch changes how the cpu load exerted by fair_sched_class tasks is calculated. Load exerted by fair_sched_class tasks on a cpu is now a summation of the group weights, rather than summation of task weights. Weight exerted by a group on a cpu is dependent on the shares allocated to it. This version of patch has a minor impact on code size, but should have no runtime/functional impact for !CONFIG_FAIR_GROUP_SCHED. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar commit 6f814d8482a577469a51cf6a002620bf5db18aa6 Author: Srivatsa Vaddagiri Date: Tue Nov 27 13:49:19 2007 +0100 sched: group scheduling, minor fixes Minor bug fixes for the group scheduler: - Use a mutex to serialize add/remove of task groups and also when changing shares of a task group. Use the same mutex when printing cfs_rq debugging stats for various task groups. - Use list_for_each_entry_rcu in for_each_leaf_cfs_rq macro (when walking task group list) Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar commit 4370e087585a2da92447f4d84596f3833380599e Author: Srivatsa Vaddagiri Date: Tue Nov 27 13:49:19 2007 +0100 sched: group scheduling code cleanup Minor cleanups: - Fix coding style - remove obsolete comment Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar Documentation/cpu-hotplug.txt | 11 +- arch/mips/kernel/mips-mt-fpaff.c | 10 +- arch/powerpc/platforms/pseries/hotplug-cpu.c | 8 +- arch/powerpc/platforms/pseries/rtasd.c | 8 +- arch/x86/kernel/cpu/mtrr/main.c | 8 +- arch/x86/kernel/microcode.c | 16 +- drivers/lguest/x86/core.c | 8 +- drivers/s390/char/sclp_config.c | 4 +- include/linux/cpu.h | 17 +- include/linux/notifier.h | 4 +- include/linux/sched.h | 4 + init/main.c | 1 + kernel/cpu.c | 164 ++++++++---- kernel/cpuset.c | 14 +- kernel/rcutorture.c | 6 +- kernel/sched.c | 384 +++++++++++++++++++++----- kernel/sched_fair.c | 119 ++++++--- kernel/sched_rt.c | 2 + kernel/stop_machine.c | 4 +- kernel/sysctl.c | 18 ++ kernel/workqueue.c | 35 +-- mm/slab.c | 18 +- net/core/flow.c | 4 +- 23 files changed, 634 insertions(+), 233 deletions(-) diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index a741f65..fb94f5a 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -109,12 +109,13 @@ Never use anything other than cpumask_t to represent bitmap of CPUs. for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask. #include - lock_cpu_hotplug() and unlock_cpu_hotplug(): + get_online_cpus() and put_online_cpus(): -The above calls are used to inhibit cpu hotplug operations. While holding the -cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid -cpus going away, you could also use preempt_disable() and preempt_enable() -for those sections. Just remember the critical section cannot call any +The above calls are used to inhibit cpu hotplug operations. 
While the +cpu_hotplug.refcount is non zero, the cpu_online_map will not change. +If you merely need to avoid cpus going away, you could also use +preempt_disable() and preempt_enable() for those sections. +Just remember the critical section cannot call any function that can sleep or schedule this process away. The preempt_disable() will work as long as stop_machine_run() is used to take a cpu down. diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 892665b..bb4f00c 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -58,13 +58,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - lock_cpu_hotplug(); + get_online_cpus(); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + put_online_cpus(); return -ESRCH; } @@ -106,7 +106,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, out_unlock: put_task_struct(p); - unlock_cpu_hotplug(); + put_online_cpus(); return retval; } @@ -125,7 +125,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (len < real_len) return -EINVAL; - lock_cpu_hotplug(); + get_online_cpus(); read_lock(&tasklist_lock); retval = -ESRCH; @@ -140,7 +140,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, out_unlock: read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + put_online_cpus(); if (retval) return retval; if (copy_to_user(user_mask_ptr, &mask, real_len)) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fc48b96..67b8016 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -151,7 +151,7 @@ static int pseries_add_processor(struct device_node *np) for (i = 0; i < nthreads; i++) cpu_set(i, tmp); - lock_cpu_hotplug(); + cpu_maps_update_begin(); BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map)); @@ -188,7 +188,7 @@ static int pseries_add_processor(struct device_node *np) } err = 0; out_unlock: - unlock_cpu_hotplug(); + cpu_maps_update_done(); return err; } @@ -209,7 +209,7 @@ static void pseries_remove_processor(struct device_node *np) nthreads = len / sizeof(u32); - lock_cpu_hotplug(); + cpu_maps_update_begin(); for (i = 0; i < nthreads; i++) { for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != intserv[i]) @@ -223,7 +223,7 @@ static void pseries_remove_processor(struct device_node *np) printk(KERN_WARNING "Could not find cpu to remove " "with physical id 0x%x\n", intserv[i]); } - unlock_cpu_hotplug(); + cpu_maps_update_done(); } static int pseries_smp_notifier(struct notifier_block *nb, diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c index 73401c8..e3078ce 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/platforms/pseries/rtasd.c @@ -382,7 +382,7 @@ static void do_event_scan_all_cpus(long delay) { int cpu; - lock_cpu_hotplug(); + get_online_cpus(); cpu = first_cpu(cpu_online_map); for (;;) { set_cpus_allowed(current, cpumask_of_cpu(cpu)); @@ -390,15 +390,15 @@ static void do_event_scan_all_cpus(long delay) set_cpus_allowed(current, CPU_MASK_ALL); /* Drop hotplug lock, and sleep for the specified delay */ - unlock_cpu_hotplug(); + put_online_cpus(); msleep_interruptible(delay); - lock_cpu_hotplug(); + get_online_cpus(); cpu = next_cpu(cpu, 
cpu_online_map); if (cpu == NR_CPUS) break; } - unlock_cpu_hotplug(); + put_online_cpus(); } static int rtasd(void *unused) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 3b20613..beb45c9 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -349,7 +349,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, replace = -1; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { @@ -405,7 +405,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } @@ -495,7 +495,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&mtrr_mutex); if (reg < 0) { /* Search for existing MTRR */ @@ -536,7 +536,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) error = reg; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } /** diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 09c3152..40cfd54 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -436,7 +436,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ return -EINVAL; } - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(µcode_mutex); user_buffer = (void __user *) buf; @@ -447,7 +447,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ ret = (ssize_t)len; mutex_unlock(µcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return ret; } @@ -658,14 +658,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) old = current->cpus_allowed; - lock_cpu_hotplug(); + get_online_cpus(); set_cpus_allowed(current, cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); if (uci->valid) err = cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); set_cpus_allowed(current, old); } if (err) @@ -817,9 +817,9 @@ static int __init microcode_init (void) return PTR_ERR(microcode_pdev); } - lock_cpu_hotplug(); + get_online_cpus(); error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); if (error) { microcode_dev_exit(); platform_device_unregister(microcode_pdev); @@ -839,9 +839,9 @@ static void __exit microcode_exit (void) unregister_hotcpu_notifier(&mc_cpu_notifier); - lock_cpu_hotplug(); + get_online_cpus(); sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); platform_device_unregister(microcode_pdev); } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 482aec2..96d0fd0 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void) /* We don't need the complexity of CPUs coming and going while we're * doing this. */ - lock_cpu_hotplug(); + get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; @@ -469,20 +469,20 @@ void __init lguest_arch_host_init(void) /* Turn off the feature in the global feature set. 
*/ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); } - unlock_cpu_hotplug(); + put_online_cpus(); }; /*:*/ void __exit lguest_arch_host_fini(void) { /* If we had PGE before we started, turn it back on now. */ - lock_cpu_hotplug(); + get_online_cpus(); if (cpu_had_pge) { set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); /* adjust_pge's argument "1" means set PGE. */ on_each_cpu(adjust_pge, (void *)1, 0, 1); } - unlock_cpu_hotplug(); + put_online_cpus(); } diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c index 5322e5e..9dc77f1 100644 --- a/drivers/s390/char/sclp_config.c +++ b/drivers/s390/char/sclp_config.c @@ -29,12 +29,12 @@ static void sclp_cpu_capability_notify(struct work_struct *work) struct sys_device *sysdev; printk(KERN_WARNING TAG "cpu capability changed.\n"); - lock_cpu_hotplug(); + get_online_cpus(); for_each_online_cpu(cpu) { sysdev = get_cpu_sysdev(cpu); kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); } - unlock_cpu_hotplug(); + put_online_cpus(); } static void sclp_conf_receiver_fn(struct evbuf_header *evbuf) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b79c575..e0132cb 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -71,18 +71,27 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) int cpu_up(unsigned int cpu); +extern void cpu_hotplug_init(void); + #else static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; } + static inline void unregister_cpu_notifier(struct notifier_block *nb) { } +static inline void cpu_hotplug_init(void) +{ +} + #endif /* CONFIG_SMP */ extern struct sysdev_class cpu_sysdev_class; +extern void cpu_maps_update_begin(void); +extern void cpu_maps_update_done(void); #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down. */ @@ -97,8 +106,8 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) mutex_unlock(cpu_hp_mutex); } -extern void lock_cpu_hotplug(void); -extern void unlock_cpu_hotplug(void); +extern void get_online_cpus(void); +extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) { \ static struct notifier_block fn##_nb = \ { .notifier_call = fn, .priority = pri }; \ @@ -116,8 +125,8 @@ static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex) static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) { } -#define lock_cpu_hotplug() do { } while (0) -#define unlock_cpu_hotplug() do { } while (0) +#define get_online_cpus() do { } while (0) +#define put_online_cpus() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. 
*/ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 0c40cc0..5dfbc68 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -207,9 +207,7 @@ static inline int notifier_to_errno(int ret) #define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ #define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ -#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ -#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ -#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task, +#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, * not handling interrupts, soon dead */ /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend diff --git a/include/linux/sched.h b/include/linux/sched.h index ee800e7..3e4f306 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1467,6 +1467,10 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +extern unsigned int sysctl_sched_min_bal_int_shares; +extern unsigned int sysctl_sched_max_bal_int_shares; +#endif int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/init/main.c b/init/main.c index 80b04b6..f287ca5 100644 --- a/init/main.c +++ b/init/main.c @@ -607,6 +607,7 @@ asmlinkage void __init start_kernel(void) vfs_caches_init_early(); cpuset_init_early(); mem_init(); + cpu_hotplug_init(); kmem_cache_init(); setup_per_cpu_pageset(); numa_policy_init(); diff --git a/kernel/cpu.c b/kernel/cpu.c index 6b3a0c1..e0d3a4f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,9 +15,8 @@ #include #include -/* This protects CPUs going up and down... */ +/* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); -static DEFINE_MUTEX(cpu_bitmask_lock); static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); @@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); */ static int cpu_hotplug_disabled; -#ifdef CONFIG_HOTPLUG_CPU +static struct { + struct task_struct *active_writer; + struct mutex lock; /* Synchronizes accesses to refcount, */ + /* + * Also blocks the new readers during + * an ongoing cpu hotplug operation. + */ + int refcount; + wait_queue_head_t writer_queue; +} cpu_hotplug; -/* Crappy recursive lock-takers in cpufreq! 
Complain loudly about idiots */ -static struct task_struct *recursive; -static int recursive_depth; +#define writer_exists() (cpu_hotplug.active_writer != NULL) -void lock_cpu_hotplug(void) +void __init cpu_hotplug_init(void) { - struct task_struct *tsk = current; - - if (tsk == recursive) { - static int warnings = 10; - if (warnings) { - printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); - WARN_ON(1); - warnings--; - } - recursive_depth++; + cpu_hotplug.active_writer = NULL; + mutex_init(&cpu_hotplug.lock); + cpu_hotplug.refcount = 0; + init_waitqueue_head(&cpu_hotplug.writer_queue); +} + +#ifdef CONFIG_HOTPLUG_CPU + +void get_online_cpus(void) +{ + might_sleep(); + if (cpu_hotplug.active_writer == current) return; - } - mutex_lock(&cpu_bitmask_lock); - recursive = tsk; + mutex_lock(&cpu_hotplug.lock); + cpu_hotplug.refcount++; + mutex_unlock(&cpu_hotplug.lock); + } -EXPORT_SYMBOL_GPL(lock_cpu_hotplug); +EXPORT_SYMBOL_GPL(get_online_cpus); -void unlock_cpu_hotplug(void) +void put_online_cpus(void) { - WARN_ON(recursive != current); - if (recursive_depth) { - recursive_depth--; + if (cpu_hotplug.active_writer == current) return; - } - recursive = NULL; - mutex_unlock(&cpu_bitmask_lock); + mutex_lock(&cpu_hotplug.lock); + cpu_hotplug.refcount--; + + if (unlikely(writer_exists()) && !cpu_hotplug.refcount) + wake_up(&cpu_hotplug.writer_queue); + + mutex_unlock(&cpu_hotplug.lock); + } -EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); +EXPORT_SYMBOL_GPL(put_online_cpus); #endif /* CONFIG_HOTPLUG_CPU */ +/* + * The following two API's must be used when attempting + * to serialize the updates to cpu_online_map, cpu_present_map. + */ +void cpu_maps_update_begin(void) +{ + mutex_lock(&cpu_add_remove_lock); +} + +void cpu_maps_update_done(void) +{ + mutex_unlock(&cpu_add_remove_lock); +} + +/* + * This ensures that the hotplug operation can begin only when the + * refcount goes to zero. + * + * Note that during a cpu-hotplug operation, the new readers, if any, + * will be blocked by the cpu_hotplug.lock + * + * Since cpu_maps_update_begin is always called after invoking + * cpu_maps_update_begin, we can be sure that only one writer is active. + * + * Note that theoretically, there is a possibility of a livelock: + * - Refcount goes to zero, last reader wakes up the sleeping + * writer. + * - Last reader unlocks the cpu_hotplug.lock. + * - A new reader arrives at this moment, bumps up the refcount. + * - The writer acquires the cpu_hotplug.lock finds the refcount + * non zero and goes to sleep again. + * + * However, this is very difficult to achieve in practice since + * get_online_cpus() not an api which is called all that often. + * + */ +static void cpu_hotplug_begin(void) +{ + DECLARE_WAITQUEUE(wait, current); + + mutex_lock(&cpu_hotplug.lock); + + cpu_hotplug.active_writer = current; + add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); + while (cpu_hotplug.refcount) { + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&cpu_hotplug.lock); + schedule(); + mutex_lock(&cpu_hotplug.lock); + } + remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); +} + +static void cpu_hotplug_done(void) +{ + cpu_hotplug.active_writer = NULL; + mutex_unlock(&cpu_hotplug.lock); +} /* Need to know about CPUs going up/down? 
*/ int __cpuinit register_cpu_notifier(struct notifier_block *nb) { int ret; - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); ret = raw_notifier_chain_register(&cpu_chain, nb); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return ret; } @@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + cpu_hotplug_begin(); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { @@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed(current, tmp); - mutex_lock(&cpu_bitmask_lock); p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - mutex_unlock(&cpu_bitmask_lock); if (IS_ERR(p) || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ @@ -202,7 +270,7 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out_release: - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); + cpu_hotplug_done(); return err; } @@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu) { int err = 0; - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); if (cpu_hotplug_disabled) err = -EBUSY; else err = _cpu_down(cpu, 0); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return err; } #endif /*CONFIG_HOTPLUG_CPU*/ @@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) if (cpu_online(cpu) || !cpu_present(cpu)) return -EINVAL; - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); + cpu_hotplug_begin(); ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret == NOTIFY_BAD) { @@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) } /* Arch-specific enabling code. 
*/ - mutex_lock(&cpu_bitmask_lock); ret = __cpu_up(cpu); - mutex_unlock(&cpu_bitmask_lock); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -257,7 +323,7 @@ out_notify: if (ret != 0) __raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); - raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); + cpu_hotplug_done(); return ret; } @@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu) return -EINVAL; } - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); if (cpu_hotplug_disabled) err = -EBUSY; else err = _cpu_up(cpu, 0); - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return err; } @@ -292,7 +358,7 @@ int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); first_cpu = first_cpu(cpu_online_map); /* We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -319,7 +385,7 @@ int disable_nonboot_cpus(void) } else { printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); return error; } @@ -328,7 +394,7 @@ void enable_nonboot_cpus(void) int cpu, error; /* Allow everyone to use the CPU hotplug again */ - mutex_lock(&cpu_add_remove_lock); + cpu_maps_update_begin(); cpu_hotplug_disabled = 0; if (cpus_empty(frozen_cpus)) goto out; @@ -344,6 +410,6 @@ void enable_nonboot_cpus(void) } cpus_clear(frozen_cpus); out: - mutex_unlock(&cpu_add_remove_lock); + cpu_maps_update_done(); } #endif /* CONFIG_PM_SLEEP_SMP */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc4..cfaf641 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) * * Call with cgroup_mutex held. May take callback_mutex during * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * a call to the get_online_cpus()/put_online_cpus() pair. * Must not be called holding callback_mutex, because we must not - * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. + * call get_online_cpus() while holding callback_mutex. Elsewhere + * the kernel nests callback_mutex inside get_online_cpus() calls. * So the reverse nesting would risk an ABBA deadlock. * * The three key local variables below are: @@ -691,9 +691,9 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ - lock_cpu_hotplug(); + get_online_cpus(); partition_sched_domains(ndoms, doms); - unlock_cpu_hotplug(); + put_online_cpus(); done: if (q && !IS_ERR(q)) @@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). The lock_cpu_hotplug() + * will call rebuild_sched_domains(). The get_online_cpus() * call in rebuild_sched_domains() must not be made while holding * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * lock_cpu_hotplug() calls. So the reverse nesting would risk an + * get_online_cpus() calls. So the reverse nesting would risk an * ABBA deadlock. 
*/ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c..fd59982 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) cpumask_t tmp_mask = CPU_MASK_ALL; int i; - lock_cpu_hotplug(); + get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ if (num_online_cpus() == 1) { - unlock_cpu_hotplug(); + put_online_cpus(); return; } @@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) else rcu_idle_cpu--; - unlock_cpu_hotplug(); + put_online_cpus(); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the diff --git a/kernel/sched.c b/kernel/sched.c index 38933ca..950d0d3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -168,9 +168,43 @@ struct task_group { struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; + + /* + * shares assigned to a task group governs how much of cpu bandwidth + * is allocated to the group. The more shares a group has, the more is + * the cpu bandwidth allocated to it. + * + * For ex, lets say that there are three task groups, A, B and C which + * have been assigned shares 1000, 2000 and 3000 respectively. Then, + * cpu bandwidth allocated by the scheduler to task groups A, B and C + * should be: + * + * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% + * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% + * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% + * + * The weight assigned to a task group's schedulable entities on every + * cpu (task_group.se[a_cpu]->load.weight) is derived from the task + * group's shares. For ex: lets say that task group A has been + * assigned shares of 1000 and there are two CPUs in a system. Then, + * + * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; + * + * Note: It's not necessary that each of a task's group schedulable + * entity have the same weight on all CPUs. If the group + * has 2 of its tasks on CPU0 and 1 task on CPU1, then a + * better distribution of weight could be: + * + * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 + * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 + * + * rebalance_shares() is responsible for distributing the shares of a + * task groups like this among the group's schedulable entities across + * cpus. + * + */ unsigned long shares; - /* spinlock to serialize modification to shares */ - spinlock_t lock; + struct rcu_head rcu; }; @@ -182,6 +216,22 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +/* task_group_mutex serializes add/remove of task groups and also changes to + * a task group's cpu shares. + */ +static DEFINE_MUTEX(task_group_mutex); + +/* doms_cur_mutex serializes access to doms_cur[] array */ +static DEFINE_MUTEX(doms_cur_mutex); + +#ifdef CONFIG_SMP +/* kernel thread that runs rebalance_shares() periodically */ +static struct task_struct *lb_monitor_task; +static int load_balance_monitor(void *unused); +#endif + +static void set_se_shares(struct sched_entity *se, unsigned long shares); + /* Default task group. * Every task in system belong to this group at bootup. 
*/ @@ -191,12 +241,14 @@ struct task_group init_task_group = { }; #ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD +# define INIT_TASK_GROUP_LOAD 2*NICE_0_LOAD #else -# define INIT_TASK_GRP_LOAD NICE_0_LOAD +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif -static int init_task_group_load = INIT_TASK_GRP_LOAD; +#define MIN_GROUP_SHARES 1 + +static int init_task_group_load = INIT_TASK_GROUP_LOAD; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -222,9 +274,33 @@ static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) p->se.parent = task_group(p)->se[cpu]; } +static inline void lock_task_group_list(void) +{ + mutex_lock(&task_group_mutex); +} + +static inline void unlock_task_group_list(void) +{ + mutex_unlock(&task_group_mutex); +} + +static inline void lock_doms_cur(void) +{ + mutex_lock(&doms_cur_mutex); +} + +static inline void unlock_doms_cur(void) +{ + mutex_unlock(&doms_cur_mutex); +} + #else static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } +static inline void lock_task_group_list(void) { } +static inline void unlock_task_group_list(void) { } +static inline void lock_doms_cur(void) { } +static inline void unlock_doms_cur(void) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -363,7 +439,6 @@ struct rq { }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) { @@ -854,6 +929,16 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct rq_iterator *iterator); #endif +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -864,41 +949,14 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, #define sched_class_highest (&rt_sched_class) -/* - * Update delta_exec, delta_fair fields for rq. - * - * delta_fair clock advances at a rate inversely proportional to - * total load (rq->load.weight) on the runqueue, while - * delta_exec advances at the same rate as wall-clock (provided - * cpu is not idle). - * - * delta_exec / delta_fair is a measure of the (smoothened) load on this - * runqueue over any given interval. This (smoothened) load is used - * during load balance. - * - * This function is called /before/ updating rq->load - * and when switching tasks. 
- */ -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -4070,10 +4128,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4083,7 +4139,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4473,13 +4528,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) struct task_struct *p; int retval; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return -ESRCH; } @@ -4519,7 +4574,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) } out_unlock: put_task_struct(p); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return retval; } @@ -4576,7 +4631,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) struct task_struct *p; int retval; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); read_lock(&tasklist_lock); retval = -ESRCH; @@ -4592,7 +4647,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return retval; } @@ -5547,9 +5602,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) struct rq *rq; switch (action) { - case CPU_LOCK_ACQUIRE: - mutex_lock(&sched_hotcpu_mutex); - break; case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -5617,9 +5669,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; #endif - case CPU_LOCK_RELEASE: - mutex_unlock(&sched_hotcpu_mutex); - break; } return NOTIFY_OK; } @@ -6524,6 +6573,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) { int i, j; + lock_doms_cur(); + /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); @@ -6564,6 +6615,8 @@ match2: ndoms_cur = ndoms_new; register_sched_domain_sysctl(); + + unlock_doms_cur(); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -6571,10 +6624,10 @@ static int arch_reinit_sched_domains(void) { int err; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); return err; } @@ -6685,12 +6738,12 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; - mutex_lock(&sched_hotcpu_mutex); + get_online_cpus(); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); - mutex_unlock(&sched_hotcpu_mutex); + put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ 
hotcpu_notifier(update_sched_domains, 0); @@ -6698,6 +6751,18 @@ void __init sched_init_smp(void) if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); + +#ifdef CONFIG_FAIR_GROUP_SCHED + lb_monitor_task = kthread_create(load_balance_monitor, NULL, + "load_balance_monitor"); + if (!IS_ERR(lb_monitor_task)) { + lb_monitor_task->flags |= PF_NOFREEZE; + wake_up_process(lb_monitor_task); + } else { + printk(KERN_ERR "Could not create load balance monitor thread" + "(error = %ld) \n", PTR_ERR(lb_monitor_task)); + } +#endif } #else void __init sched_init_smp(void) @@ -6762,7 +6827,6 @@ void __init sched_init(void) se->parent = NULL; } init_task_group.shares = init_task_group_load; - spin_lock_init(&init_task_group.lock); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -6954,6 +7018,157 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP +/* + * distribute shares of all task groups among their schedulable entities, + * to reflect load distrbution across cpus. + */ +static int rebalance_shares(struct sched_domain *sd, int this_cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(this_cpu); + cpumask_t sdspan = sd->span; + int balanced = 1; + + /* Walk thr' all the task groups that we have */ + for_each_leaf_cfs_rq(rq, cfs_rq) { + int i; + unsigned long total_load = 0, total_shares; + struct task_group *tg = cfs_rq->tg; + + /* Gather total task load of this group across cpus */ + for_each_cpu_mask(i, sdspan) + total_load += tg->cfs_rq[i]->load.weight; + + /* Nothing to do if this group has no load */ + if (!total_load) + continue; + + /* + * tg->shares represents the number of cpu shares the task group + * is eligible to hold on a single cpu. On N cpus, it is + * eligible to hold (N * tg->shares) number of cpu shares. + */ + total_shares = tg->shares * cpus_weight(sdspan); + + /* + * redistribute total_shares across cpus as per the task load + * distribution. + */ + for_each_cpu_mask(i, sdspan) { + unsigned long local_load, local_shares; + + local_load = tg->cfs_rq[i]->load.weight; + local_shares = (local_load * total_shares) / total_load; + if (!local_shares) + local_shares = MIN_GROUP_SHARES; + if (local_shares == tg->se[i]->load.weight) + continue; + + spin_lock_irq(&cpu_rq(i)->lock); + set_se_shares(tg->se[i], local_shares); + spin_unlock_irq(&cpu_rq(i)->lock); + balanced = 0; + } + } + + return balanced; +} + +/* + * How frequently should we rebalance_shares() across cpus? + * + * The more frequently we rebalance shares, the more accurate is the fairness + * of cpu bandwidth distribution between task groups. However higher frequency + * also implies increased scheduling overhead. + * + * sysctl_sched_min_bal_int_shares represents the minimum interval between + * consecutive calls to rebalance_shares() in the same sched domain. + * + * sysctl_sched_max_bal_int_shares represents the maximum interval between + * consecutive calls to rebalance_shares() in the same sched domain. + * + * These settings allows for the appropriate tradeoff between accuracy of + * fairness and the associated overhead. 
+ * + */ + +/* default: 8ms, units: milliseconds */ +const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; + +/* default: 128ms, units: milliseconds */ +const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; + +/* kernel thread that runs rebalance_shares() periodically */ +static int load_balance_monitor(void *unused) +{ + unsigned int timeout = sysctl_sched_min_bal_int_shares; + struct sched_param schedparm; + int ret; + + /* + * We don't want this thread's execution to be limited by the shares + * assigned to default group (init_task_group). Hence make it run + * as a SCHED_RR RT task at the lowest priority. + */ + schedparm.sched_priority = 1; + ret = sched_setscheduler(current, SCHED_RR, &schedparm); + if (ret) + printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" + " monitor thread (error = %d) \n", ret); + + while (!kthread_should_stop()) { + int i, cpu, balanced = 1; + + /* Prevent cpus going down or coming up */ + get_online_cpus(); + /* lockout changes to doms_cur[] array */ + lock_doms_cur(); + /* + * Enter a rcu read-side critical section to safely walk rq->sd + * chain on various cpus and to walk task group list + * (rq->leaf_cfs_rq_list) in rebalance_shares(). + */ + rcu_read_lock(); + + for (i = 0; i < ndoms_cur; i++) { + cpumask_t cpumap = doms_cur[i]; + struct sched_domain *sd = NULL, *sd_prev = NULL; + + cpu = first_cpu(cpumap); + + /* Find the highest domain at which to balance shares */ + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + sd_prev = sd; + } + + sd = sd_prev; + /* sd == NULL? No load balance reqd in this domain */ + if (!sd) + continue; + + balanced &= rebalance_shares(sd, cpu); + } + + rcu_read_unlock(); + + unlock_doms_cur(); + put_online_cpus(); + + if (!balanced) + timeout = sysctl_sched_min_bal_int_shares; + else if (timeout < sysctl_sched_max_bal_int_shares) + timeout *= 2; + + msleep_interruptible(timeout); + } + + return 0; +} +#endif /* CONFIG_SMP */ + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(void) { @@ -7002,14 +7217,15 @@ struct task_group *sched_create_group(void) se->parent = NULL; } + tg->shares = NICE_0_LOAD; + + lock_task_group_list(); for_each_possible_cpu(i) { rq = cpu_rq(i); cfs_rq = tg->cfs_rq[i]; list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); } - - tg->shares = NICE_0_LOAD; - spin_lock_init(&tg->lock); + unlock_task_group_list(); return tg; @@ -7055,10 +7271,12 @@ void sched_destroy_group(struct task_group *tg) struct cfs_rq *cfs_rq = NULL; int i; + lock_task_group_list(); for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; list_del_rcu(&cfs_rq->leaf_cfs_rq_list); } + unlock_task_group_list(); BUG_ON(!cfs_rq); @@ -7107,41 +7325,79 @@ done: task_rq_unlock(rq, &flags); } +/* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); + if (!shares) + shares = MIN_GROUP_SHARES; on_rq = se->on_rq; - if (on_rq) + if (on_rq) { dequeue_entity(cfs_rq, se, 0); + dec_cpu_load(rq, se->load.weight); + } se->load.weight = shares; se->load.inv_weight = div64_64((1ULL<<32), shares); - if (on_rq) + if (on_rq) { enqueue_entity(cfs_rq, se, 0); - - spin_unlock_irq(&rq->lock); + inc_cpu_load(rq, se->load.weight); + } } int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + struct cfs_rq *cfs_rq; + struct rq *rq; - spin_lock(&tg->lock); + lock_task_group_list(); 
if (tg->shares == shares) goto done; + if (shares < MIN_GROUP_SHARES) + shares = MIN_GROUP_SHARES; + + /* + * Prevent any load balance activity (rebalance_shares, + * load_balance_fair) from referring to this group first, + * by taking it off the rq->leaf_cfs_rq_list on each cpu. + */ + for_each_possible_cpu(i) { + cfs_rq = tg->cfs_rq[i]; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + } + + /* wait for any ongoing reference to this group to finish */ + synchronize_sched(); + + /* + * Now we are free to modify the group's share on each cpu + * w/o tripping rebalance_share or load_balance_fair. + */ tg->shares = shares; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { + spin_lock_irq(&cpu_rq(i)->lock); set_se_shares(tg->se[i], shares); + spin_unlock_irq(&cpu_rq(i)->lock); + } + /* + * Enable load balance activity on this group, by inserting it back on + * each cpu's rq->leaf_cfs_rq_list. + */ + for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = tg->cfs_rq[i]; + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + } done: - spin_unlock(&tg->lock); + unlock_task_group_list(); return 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2f16e15..79b5fed 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -685,7 +685,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline int @@ -702,6 +702,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } +#define GROUP_IMBALANCE_PCT 20 + #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -755,15 +757,26 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, + *topse = NULL; /* Highest schedulable entity */ + int incload = 1; for_each_sched_entity(se) { - if (se->on_rq) + topse = se; + if (se->on_rq) { + incload = 0; break; + } cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); wakeup = 1; } + /* Increment cpu load if we just enqueued the first task of a group on + * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs + * at the highest grouping level. + */ + if (incload) + inc_cpu_load(rq, topse->load.weight); } /* @@ -774,16 +787,28 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, + *topse = NULL; /* Highest schedulable entity */ + int decload = 1; for_each_sched_entity(se) { + topse = se; cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) + if (cfs_rq->load.weight) { + if (parent_entity(se)) + decload = 0; break; + } sleep = 1; } + /* Decrement cpu load if we just dequeued the last task of a group on + * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs + * at the highest grouping level. 
+ */ + if (decload) + dec_cpu_load(rq, topse->load.weight); } /* @@ -938,25 +963,6 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) -{ - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); - - p = task_of(curr); - - return p->prio; -} -#endif - static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -966,28 +972,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; struct rq_iterator cfs_rq_iterator; + unsigned long load_moved; cfs_rq_iterator.start = load_balance_start_fair; cfs_rq_iterator.next = load_balance_next_fair; for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; - long imbalance; - unsigned long maxload; + struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; + unsigned long maxload, task_load, group_weight; + unsigned long thisload, per_task_load; + struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + task_load = busy_cfs_rq->load.weight; + group_weight = se->load.weight; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + /* + * 'group_weight' is contributed by tasks of total weight + * 'task_load'. To move 'rem_load_move' worth of weight only, + * we need to move a maximum task load of: + * + * maxload = (remload / group_weight) * task_load; + */ + maxload = (rem_load_move * task_load) / group_weight; + + if (!maxload || !task_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + per_task_load = task_load / busy_cfs_rq->nr_running; + /* + * balance_tasks will try to forcibly move atleast one task if + * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if + * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. + */ + if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) + continue; - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); + /* Disable priority-based load balance */ + *this_best_prio = 0; + thisload = this_cfs_rq->load.weight; #else # define maxload rem_load_move #endif @@ -996,11 +1019,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, + load_moved = balance_tasks(this_rq, this_cpu, busiest, maxload, sd, idle, all_pinned, this_best_prio, &cfs_rq_iterator); +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * load_moved holds the task load that was moved. 
The + * effective (group) weight moved would be: + * load_moved_eff = load_moved/task_load * group_weight; + */ + load_moved = (group_weight * load_moved) / task_load; + + /* Adjust shares on both cpus to reflect load_moved */ + group_weight -= load_moved; + set_se_shares(se, group_weight); + + se = busy_cfs_rq->tg->se[this_cpu]; + if (!thisload) + group_weight = load_moved; + else + group_weight = se->load.weight + load_moved; + set_se_shares(se, group_weight); +#endif + + rem_load_move -= load_moved; + if (rem_load_move <= 0) break; } @@ -1126,7 +1171,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); #endif + lock_task_group_list(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); + unlock_task_group_list(); } #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8abd752..c591fbb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -31,6 +31,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); + inc_cpu_load(rq, p->se.load.weight); } /* @@ -45,6 +46,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + dec_cpu_load(rq, p->se.load.weight); } /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821e..51b5ee5 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) int ret; /* No CPUs can come up or down during this. */ - lock_cpu_hotplug(); + get_online_cpus(); p = __stop_machine_run(fn, data, cpu); if (!IS_ERR(p)) ret = kthread_stop(p); else ret = PTR_ERR(p); - unlock_cpu_hotplug(); + put_online_cpus(); return ret; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0deed82..ab3019e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -309,6 +309,24 @@ static struct ctl_table kern_table[] = { .mode = 644, .proc_handler = &proc_dointvec, }, +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_min_bal_int_shares", + .data = &sysctl_sched_min_bal_int_shares, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_max_bal_int_shares", + .data = &sysctl_sched_max_bal_int_shares, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #endif { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 52d5e7c..1bddee3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -67,9 +67,8 @@ struct workqueue_struct { #endif }; -/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove - threads to each one as cpus come/go. */ -static DEFINE_MUTEX(workqueue_mutex); +/* Serializes the accesses to the list of workqueues. */ +static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static int singlethread_cpu __read_mostly; @@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); * Returns zero on success. * Returns -ve errno on failure. * - * Appears to be racy against CPU hotplug. - * * schedule_on_each_cpu() is very slow. 
*/ int schedule_on_each_cpu(work_func_t func) @@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - preempt_disable(); /* CPU hotplug */ + get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); @@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func) set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); } - preempt_enable(); flush_workqueue(keventd_wq); + put_online_cpus(); free_percpu(works); return 0; } @@ -749,8 +746,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, singlethread_cpu); start_workqueue_thread(cwq, -1); } else { - mutex_lock(&workqueue_mutex); + get_online_cpus(); + spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); + spin_unlock(&workqueue_lock); for_each_possible_cpu(cpu) { cwq = init_cpu_workqueue(wq, cpu); @@ -759,7 +758,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, cpu); start_workqueue_thread(cwq, cpu); } - mutex_unlock(&workqueue_mutex); + put_online_cpus(); } if (err) { @@ -774,7 +773,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { /* * Our caller is either destroy_workqueue() or CPU_DEAD, - * workqueue_mutex protects cwq->thread + * get_online_cpus() protects cwq->thread. */ if (cwq->thread == NULL) return; @@ -809,9 +808,11 @@ void destroy_workqueue(struct workqueue_struct *wq) struct cpu_workqueue_struct *cwq; int cpu; - mutex_lock(&workqueue_mutex); + get_online_cpus(); + spin_lock(&workqueue_lock); list_del(&wq->list); - mutex_unlock(&workqueue_mutex); + spin_unlock(&workqueue_lock); + put_online_cpus(); for_each_cpu_mask(cpu, *cpu_map) { cwq = per_cpu_ptr(wq->cpu_wq, cpu); @@ -834,13 +835,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, action &= ~CPU_TASKS_FROZEN; switch (action) { - case CPU_LOCK_ACQUIRE: - mutex_lock(&workqueue_mutex); - return NOTIFY_OK; - - case CPU_LOCK_RELEASE: - mutex_unlock(&workqueue_mutex); - return NOTIFY_OK; case CPU_UP_PREPARE: cpu_set(cpu, cpu_populated_map); @@ -853,7 +847,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: if (!create_workqueue_thread(cwq, cpu)) break; - printk(KERN_ERR "workqueue for %i failed\n", cpu); + printk(KERN_ERR "workqueue [%s] for %i failed\n", + wq->name, cpu); return NOTIFY_BAD; case CPU_ONLINE: diff --git a/mm/slab.c b/mm/slab.c index c31cd36..2d20a6e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -730,8 +730,7 @@ static inline void init_lock_keys(void) #endif /* - * 1. Guard access to the cache-chain. - * 2. Protect sanity of cpu_online_map against cpu hotplug events + * Guard access to the cache-chain. 
*/ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, int err = 0; switch (action) { - case CPU_LOCK_ACQUIRE: - mutex_lock(&cache_chain_mutex); - break; case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: + mutex_lock(&cache_chain_mutex); err = cpuup_prepare(cpu); + mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, #endif case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + mutex_lock(&cache_chain_mutex); cpuup_canceled(cpu); - break; - case CPU_LOCK_RELEASE: mutex_unlock(&cache_chain_mutex); break; } @@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * We use cache_chain_mutex to ensure a consistent view of * cpu_online_map as well. Please see cpuup_callback */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); list_for_each_entry(pc, &cache_chain, next) { @@ -2396,6 +2394,7 @@ oops: panic("kmem_cache_create(): failed to create slab `%s'\n", name); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep) int ret; BUG_ON(!cachep || in_interrupt()); + get_online_cpus(); mutex_lock(&cache_chain_mutex); ret = __cache_shrink(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return ret; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) BUG_ON(!cachep || in_interrupt()); /* Find the cache in the chain of caches. */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed @@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) slab_error(cachep, "Can't free all objects"); list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return; } @@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); diff --git a/net/core/flow.c b/net/core/flow.c index 3ed2b4b..6489f4e 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -293,7 +293,7 @@ void flow_cache_flush(void) static DEFINE_MUTEX(flow_flush_sem); /* Don't want cpus going down or up during this. */ - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&flow_flush_sem); atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -305,7 +305,7 @@ void flow_cache_flush(void) wait_for_completion(&info.completion); mutex_unlock(&flow_flush_sem); - unlock_cpu_hotplug(); + put_online_cpus(); } static void __devinit flow_cache_cpu_prepare(int cpu)
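
The call sites converted above all follow the same bracketing pattern; a
minimal sketch of a hotplug reader under the new API (do_per_cpu_work() is
just a placeholder, not a real kernel function):

	#include <linux/cpu.h>

	static void scan_online_cpus(void)
	{
		int cpu;

		/* Pin the set of online cpus: bumps cpu_hotplug.refcount. */
		get_online_cpus();
		for_each_online_cpu(cpu)
			do_per_cpu_work(cpu);	/* placeholder */
		/* Drop the refcount; a waiting hotplug writer may now proceed. */
		put_online_cpus();
	}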