From: Andrew Morton <akpm@osdl.org>

schedule_on_each_cpu() presently does a large kmalloc - 96 kbytes on a 64-bit
kernel built with NR_CPUS=1024.

Rework it so that we do one 8192-byte allocation and then a pile of tiny ones,
via alloc_percpu().  This has a much higher chance of success (100% in the
current VM).

This also has the effect of reducing the memory requirements from NR_CPUS*n to
num_possible_cpus()*n.

Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 kernel/workqueue.c |   28 ++++++++++++++++++++--------
 1 files changed, 20 insertions(+), 8 deletions(-)

diff -puN kernel/workqueue.c~schedule_on_each_cpu-reduce-kmalloc-size kernel/workqueue.c
--- devel/kernel/workqueue.c~schedule_on_each_cpu-reduce-kmalloc-size	2006-05-22 15:07:56.000000000 -0700
+++ devel-akpm/kernel/workqueue.c	2006-05-22 16:26:54.000000000 -0700
@@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu,
 	return ret;
 }
 
-int schedule_on_each_cpu(void (*func) (void *info), void *info)
+/**
+ * schedule_on_each_cpu - call a function on each online CPU from keventd
+ * @func: the function to call
+ * @info: a pointer to pass to func()
+ *
+ * Returns zero on success.
+ * Returns -ve errno on failure.
+ *
+ * Appears to be racy against CPU hotplug.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int schedule_on_each_cpu(void (*func)(void *info), void *info)
 {
 	int cpu;
-	struct work_struct *work;
+	struct work_struct *works;
 
-	work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
-
-	if (!work)
+	works = alloc_percpu(struct work_struct);
+	if (!works)
 		return -ENOMEM;
+
 	for_each_online_cpu(cpu) {
-		INIT_WORK(work + cpu, func, info);
+		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
-				work + cpu);
+				per_cpu_ptr(works, cpu));
 	}
 	flush_workqueue(keventd_wq);
-	kfree(work);
+	free_percpu(works);
 	return 0;
 }
 
_