===== kernel/sched.c 1.224 vs edited =====
--- 1.224/kernel/sched.c	2004-07-12 18:32:52 -04:00
+++ edited/kernel/sched.c	2004-07-23 15:31:40 -04:00
@@ -3699,26 +3699,97 @@
 #ifdef CONFIG_NUMA
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
+
+/* Maximum number of nodes a node-level sched_domain will span */
+#define SD_MAX_NODES 64
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < numnodes; i++) {
+		/* Start at @node */
+		n = (node + i) % numnodes;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ *
+ * Note that this *should* be hierarchical rather than flat, i.e. the
+ * domain above single CPUs should only span nodes or physical chassis, and
+ * a domain above that should contain a larger number of CPUs, though
+ * probably not all of the available ones.
+ */
+static cpumask_t __init sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	for (i = 0; i < SD_MAX_NODES && i < numnodes; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+	return span;
+}
+
 static void __init arch_init_sched_domains(void)
 {
 	int i;
 	struct sched_group *first_node = NULL, *last_node = NULL;
 
+	/* Set up node-level domains */
+	for_each_cpu(i) {
+		int node = cpu_to_node(i);
+		struct sched_domain *node_sd = &per_cpu(node_domains, i);
+
+		*node_sd = SD_NODE_INIT;
+		node_sd->span = sched_domain_node_span(node);
+		node_sd->groups = &sched_group_nodes[node];
+	}
+
 	/* Set up domains */
 	for_each_cpu(i) {
 		int node = cpu_to_node(i);
 		cpumask_t nodemask = node_to_cpumask(node);
-		struct sched_domain *node_sd = &per_cpu(node_domains, i);
 		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
 
-		*node_sd = SD_NODE_INIT;
-		node_sd->span = cpu_possible_map;
-		node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
-
 		*cpu_sd = SD_CPU_INIT;
 		cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
 		cpu_sd->groups = &sched_group_cpus[i];
-		cpu_sd->parent = node_sd;
+		cpu_sd->parent = &per_cpu(node_domains, i);
 	}
 
 	/* Set up groups */
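
For reference, the greedy node-selection walk above can be sanity-checked entirely in userspace.  The sketch below is not kernel code: NR_NODES, the dist[][] table and next_best_node() are invented stand-ins for numnodes, node_distance() and find_next_best_node(), chosen only to show the order in which nodes would be pulled into a domain's span.

/*
 * Userspace sketch only -- NOT kernel code.  NR_NODES, dist[][] and
 * next_best_node() are made-up stand-ins for numnodes, node_distance()
 * and find_next_best_node(); they just illustrate the greedy walk.
 */
#include <limits.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical SLIT-style distances: 10 = local, bigger = farther away. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Pick the closest not-yet-used node, mirroring find_next_best_node(). */
static int next_best_node(int node, int *used)
{
	int i, best = -1, min_val = INT_MAX;

	for (i = 0; i < NR_NODES; i++) {
		int n = (node + i) % NR_NODES;	/* start the scan at @node */

		if (used[n])
			continue;
		if (dist[node][n] < min_val) {
			min_val = dist[node][n];
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int node, i;

	for (node = 0; node < NR_NODES; node++) {
		int used[NR_NODES] = { 0 };

		printf("node %d span order:", node);
		for (i = 0; i < NR_NODES; i++)
			printf(" %d", next_best_node(node, used));
		printf("\n");
	}
	return 0;
}

On this made-up table, node 1's span grows as 1, 0, 2, 3: nearest node first, ties broken by scan order.  That is the behaviour the patch relies on when SD_MAX_NODES is smaller than numnodes, so each node-level domain covers its closest neighbours rather than the whole machine.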