Index: linux-2.6.19-rc2-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.19-rc2-mm1.orig/include/linux/sched.h	2006-10-18 17:36:36.586477666 -0500
+++ linux-2.6.19-rc2-mm1/include/linux/sched.h	2006-10-18 17:37:04.960004655 -0500
@@ -656,7 +656,6 @@ enum idle_type
 
 struct sched_group {
-	struct sched_group *next;	/* Must be a circular list */
 	cpumask_t cpumask;
 
 	/*
@@ -671,6 +670,7 @@ struct sched_domain {
 	struct sched_domain *parent;	/* top domain must be null terminated */
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
+	struct sched_group *lastgroup;	/* Last sched_group of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
Index: linux-2.6.19-rc2-mm1/kernel/sched.c
===================================================================
--- linux-2.6.19-rc2-mm1.orig/kernel/sched.c	2006-10-18 17:36:36.617730346 -0500
+++ linux-2.6.19-rc2-mm1/kernel/sched.c	2006-10-18 17:55:58.947794258 -0500
@@ -1184,19 +1184,19 @@ static inline unsigned long cpu_avg_load
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 {
-	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	struct sched_group *idlest = NULL, *this = NULL, *group;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	do {
+	for (group = sd->groups; group < sd->lastgroup; group++) {
 		unsigned long load, avg_load;
 		int local_group;
 		int i;
 
 		/* Skip over this group if it has no CPUs allowed */
 		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
-			goto nextgroup;
+			continue;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -1223,9 +1223,7 @@ find_idlest_group(struct sched_domain *s
 			min_load = avg_load;
 			idlest = group;
 		}
-nextgroup:
-		group = group->next;
-	} while (group != sd->groups);
+	}
 
 	if (!idlest || 100*this_load < imbalance*min_load)
 		return NULL;
@@ -2233,7 +2231,7 @@ find_busiest_group(struct sched_domain *
 		   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
 		   cpumask_t *cpus)
 {
-	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+	struct sched_group *busiest = NULL, *this = NULL, *group;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 	unsigned long max_pull;
 	unsigned long busiest_load_per_task, busiest_nr_running;
@@ -2256,7 +2254,7 @@ find_busiest_group(struct sched_domain *
 	else
 		load_idx = sd->idle_idx;
 
-	do {
+	for (group = sd->groups; group < sd->lastgroup; group++) {
 		unsigned long load, group_capacity;
 		int local_group;
 		int i;
@@ -2316,7 +2314,7 @@ find_busiest_group(struct sched_domain *
 		 * balance.
 		 */
 		if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-			goto group_next;
+			continue;
 
 		/*
 		 * If the local group is idle or completely loaded
@@ -2332,7 +2330,7 @@ find_busiest_group(struct sched_domain *
 		 */
 		if (!power_savings_balance || sum_nr_running >= group_capacity
 		    || !sum_nr_running)
-			goto group_next;
+			continue;
 
 		/*
 		 * Calculate the group which has the least non-idle load.
@@ -2363,10 +2361,8 @@ find_busiest_group(struct sched_domain *
 			leader_nr_running = sum_nr_running;
 		}
 	}
-group_next:
 #endif
-		group = group->next;
-	} while (group != sd->groups);
+	}
 
 	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 		goto out_balanced;
@@ -5481,7 +5477,7 @@ static void sched_domain_debug(struct sc
 		for (i = 0; i < level + 2; i++)
 			printk(" ");
 		printk("groups:");
-		do {
+		for (group = sd->groups; group < sd->lastgroup; group++) {
 			if (!group) {
 				printk("\n");
 				printk(KERN_ERR "ERROR: group is NULL\n");
@@ -5507,9 +5503,7 @@ static void sched_domain_debug(struct sc
 
 			cpumask_scnprintf(str, NR_CPUS, group->cpumask);
 			printk(" %s", str);
-
-			group = group->next;
-		} while (group != sd->groups);
+		}
 		printk("\n");
 
 		if (!cpus_equal(sd->span, groupmask))
@@ -5541,7 +5535,7 @@ static int sd_degenerate(struct sched_do
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUPOWER |
 			 SD_SHARE_PKG_RESOURCES)) {
-		if (sd->groups != sd->groups->next)
+		if (sd->groups + 1 != sd->lastgroup)
 			return 0;
 	}
 
@@ -5570,7 +5564,7 @@ sd_parent_degenerate(struct sched_domain
 	if (cflags & SD_WAKE_AFFINE)
 		pflags &= ~SD_WAKE_BALANCE;
 	/* Flags needing groups don't count if only 1 group in parent */
-	if (parent->groups == parent->groups->next) {
+	if (parent->groups + 1 == parent->lastgroup) {
 		pflags &= ~(SD_LOAD_BALANCE |
 				SD_BALANCE_NEWIDLE |
 				SD_BALANCE_FORK |
@@ -5637,48 +5631,53 @@ __setup ("isolcpus=", isolated_cpu_setup
 
 /*
  * init_sched_build_groups takes an array of groups, the cpumask we wish
  * to span, and a pointer to a function which identifies what group a CPU
- * belongs to. The return value of group_fn must be a valid index into the
- * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
- * keep track of groups covered with a cpumask_t).
+ * belongs to. The return value of group_fn must be a unique per-group value
+ * that is the same for every CPU belonging to that group, so that the
+ * members of a group can be identified.
  *
- * init_sched_build_groups will build a circular linked list of the groups
+ * init_sched_build_groups will allocate and initialize an array of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
 static void
-init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+init_sched_build_groups(struct sched_domain *sd,
 			const cpumask_t *cpu_map,
-			int (*group_fn)(int cpu, const cpumask_t *cpu_map))
+			int (*group_fn)(int cpu, const cpumask_t *cpu_map),
+			int start_cpu)
 {
-	struct sched_group *first = NULL, *last = NULL;
+	struct sched_group *groups;
 	cpumask_t covered = CPU_MASK_NONE;
-	int i;
+	cpumask_t span = sd->span;
+	int i = start_cpu;
+
+	groups = kmalloc_node(sizeof(struct sched_group) * cpus_weight(span),
+			GFP_KERNEL, page_to_nid(virt_to_page(sd)));
+	BUG_ON(!groups);
+	sd->groups = groups;
 
-	for_each_cpu_mask(i, span) {
+	do {
 		int group = group_fn(i, cpu_map);
-		struct sched_group *sg = &groups[group];
 		int j;
 
+		if (!cpu_isset(i, span))
+			continue;
 		if (cpu_isset(i, covered))
 			continue;
 
-		sg->cpumask = CPU_MASK_NONE;
-		sg->cpu_power = 0;
+		groups->cpumask = CPU_MASK_NONE;
+		groups->cpu_power = 0;
 		for_each_cpu_mask(j, span) {
 			if (group_fn(j, cpu_map) != group)
 				continue;
 
 			cpu_set(j, covered);
-			cpu_set(j, sg->cpumask);
+			cpu_set(j, groups->cpumask);
 		}
-		if (!first)
-			first = sg;
-		if (last)
-			last->next = sg;
-		last = sg;
-	}
-	last->next = first;
+		groups++;
+		i = ((i + 1) % NR_CPUS);
+	} while (i != start_cpu);
+	sd->lastgroup = groups;
 }
 
 #define SD_NODES_PER_DOMAIN 16
@@ -6237,13 +6236,40 @@ static cpumask_t sched_domain_node_span(
 
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
+enum sd_types {
+#ifdef CONFIG_SCHED_SMT
+	CPU_DOMAINS,
+#endif
+#ifdef CONFIG_SCHED_MC
+	CORE_DOMAIN,
+#endif
+	PHYS_DOMAINS,
+#ifdef CONFIG_NUMA
+	NODE_DOMAINS,
+	ALLNODES_DOMAINS,
+#endif
+	NR_DOMAINS
+};
+
+static DEFINE_PER_CPU(struct sched_domain, coredomains)[NR_DOMAINS];
+
+static int (*func[NR_DOMAINS])(int, const cpumask_t *) = {
+#ifdef CONFIG_SCHED_SMT
+	cpu_to_cpu_group,
+#endif
+#ifdef CONFIG_SCHED_MC
+	cpu_to_core_group,
+#endif
+	cpu_to_phys_group,
+#ifdef CONFIG_NUMA
+	cpu_to_allnodes_group,
+	cpu_to_allnodes_group,
+#endif
+};
+
 /*
  * SMT sched-domains:
  */
 #ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-
 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu;
@@ -6253,11 +6279,6 @@ static int cpu_to_cpu_group(int cpu, con
 /*
  * multi-core sched-domains:
  */
-#ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
-#endif
-
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
 {
@@ -6272,9 +6293,6 @@ static int cpu_to_core_group(int cpu, co
 }
 #endif
 
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-
 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_SCHED_MC
@@ -6291,21 +6309,11 @@ static int cpu_to_phys_group(int cpu, co
 }
 
 #ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
-
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
-
 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu_to_node(cpu);
 }
+
 static void init_numa_sched_groups_power(struct sched_group *group_head)
 {
 	struct sched_group *sg = group_head;
@@ -6334,53 +6342,20 @@ next_sg:
 }
 #endif
 
-#ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
 static void free_sched_groups(const cpumask_t *cpu_map)
 {
 	int cpu, i;
+	struct sched_domain *sd;
 
-	for_each_cpu_mask(cpu, *cpu_map) {
-		struct sched_group *sched_group_allnodes
-			= sched_group_allnodes_bycpu[cpu];
-		struct sched_group **sched_group_nodes
-			= sched_group_nodes_bycpu[cpu];
-
-		if (sched_group_allnodes) {
-			kfree(sched_group_allnodes);
-			sched_group_allnodes_bycpu[cpu] = NULL;
-		}
-
-		if (!sched_group_nodes)
-			continue;
-
-		for (i = 0; i < MAX_NUMNODES; i++) {
-			cpumask_t nodemask = node_to_cpumask(i);
-			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-			cpus_and(nodemask, nodemask, *cpu_map);
-			if (cpus_empty(nodemask))
-				continue;
-
-			if (sg == NULL)
-				continue;
-			sg = sg->next;
-next_sg:
-			oldsg = sg;
-			sg = sg->next;
-			kfree(oldsg);
-			if (oldsg != sched_group_nodes[i])
-				goto next_sg;
+	for_each_cpu_mask(cpu, *cpu_map)
+		for (i = 0; i < NR_DOMAINS; i++) {
+			sd = &per_cpu(coredomains, cpu)[i];
+			if (sd->groups) {
+				kfree(sd->groups);
+				sd->groups = NULL;
 			}
-		kfree(sched_group_nodes);
-		sched_group_nodes_bycpu[cpu] = NULL;
 	}
 }
-#else
-static void free_sched_groups(const cpumask_t *cpu_map)
-{
-}
-#endif
 
 /*
  * Initialize sched groups cpu_power.
@@ -6427,11 +6402,8 @@ static void init_sched_groups_power(int
 	/*
 	 * add cpu_power of each child group to this groups cpu_power
 	 */
-	group = child->groups;
-	do {
+	for (group = child->groups; group < child->lastgroup; group++)
 		sd->groups->cpu_power += group->cpu_power;
-		group = group->next;
-	} while (group != child->groups);
 }
 
 /*
@@ -6440,30 +6412,13 @@ static int build_sched_domains(const cpu
  */
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
+	struct sched_domain *sd = NULL, *p;
 	int i;
-	struct sched_domain *sd;
-#ifdef CONFIG_NUMA
-	struct sched_group **sched_group_nodes = NULL;
-	struct sched_group *sched_group_allnodes = NULL;
-
-	/*
-	 * Allocate the per-node list of sched groups
-	 */
-	sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-					   GFP_KERNEL);
-	if (!sched_group_nodes) {
-		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return -ENOMEM;
-	}
-	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
-#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
 	for_each_cpu_mask(i, *cpu_map) {
-		int group;
-		struct sched_domain *sd = NULL, *p;
 		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
 
 		cpus_and(nodemask, nodemask, *cpu_map);
@@ -6471,26 +6426,12 @@ static int build_sched_domains(const cpu
 #ifdef CONFIG_NUMA
 		if (cpus_weight(*cpu_map)
 			> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
-			if (!sched_group_allnodes) {
-				sched_group_allnodes
-					= kmalloc_node(sizeof(struct sched_group)
-							* MAX_NUMNODES,
-						  GFP_KERNEL,
-						  cpu_to_node(i));
-				if (!sched_group_allnodes) {
-					printk(KERN_WARNING
-					"Can not alloc allnodes sched group\n");
-					goto error;
-				}
-				sched_group_allnodes_bycpu[i]
-						= sched_group_allnodes;
-			}
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
-			group = cpu_to_allnodes_group(i, cpu_map);
-			sd->groups = &sched_group_allnodes[group];
 			p = sd;
+			init_sched_build_groups(&per_cpu(allnodes_domains, i),
+					cpu_map, &cpu_to_allnodes_group, i);
 		} else
 			p = NULL;
 
@@ -6501,154 +6442,45 @@ static int build_sched_domains(const cpu
 		if (p)
 			p->child = sd;
 		cpus_and(sd->span, sd->span, *cpu_map);
+		if (!cpus_empty(nodemask))
+			init_sched_build_groups(&per_cpu(node_domains, i),
+					cpu_map, &cpu_to_phys_group, i);
 #endif
 
-		p = sd;
 		sd = &per_cpu(phys_domains, i);
-		group = cpu_to_phys_group(i, cpu_map);
 		*sd = SD_CPU_INIT;
 		sd->span = nodemask;
 		sd->parent = p;
 		if (p)
 			p->child = sd;
-		sd->groups = &sched_group_phys[group];
+		if (!cpus_empty(nodemask))
+			init_sched_build_groups(&per_cpu(phys_domains, i),
					cpu_map, &cpu_to_phys_group, i);
 
 #ifdef CONFIG_SCHED_MC
 		p = sd;
 		sd = &per_cpu(core_domains, i);
-		group = cpu_to_core_group(i, cpu_map);
 		*sd = SD_MC_INIT;
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
-		sd->groups = &sched_group_core[group];
+		init_sched_build_groups(&per_cpu(core_domains, i),
+				cpu_map, &cpu_to_core_group, i);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
-		group = cpu_to_cpu_group(i, cpu_map);
 		*sd = SD_SIBLING_INIT;
 		sd->span = cpu_sibling_map[i];
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
-		sd->groups = &sched_group_cpus[group];
+		init_sched_build_groups(&per_cpu(cpu_domains, i),
				cpu_map, &cpu_to_cpu_group, i);
 #endif
 	}
-
-#ifdef CONFIG_SCHED_SMT
-	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask(i, *cpu_map) {
-		cpumask_t this_sibling_map = cpu_sibling_map[i];
-		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
-		if (i != first_cpu(this_sibling_map))
-			continue;
-
-		init_sched_build_groups(sched_group_cpus, this_sibling_map,
-						cpu_map, &cpu_to_cpu_group);
-	}
-#endif
-
-#ifdef CONFIG_SCHED_MC
-	/* Set up multi-core groups */
-	for_each_cpu_mask(i, *cpu_map) {
-		cpumask_t this_core_map = cpu_coregroup_map(i);
-		cpus_and(this_core_map, this_core_map, *cpu_map);
-		if (i != first_cpu(this_core_map))
-			continue;
-		init_sched_build_groups(sched_group_core, this_core_map,
-					cpu_map, &cpu_to_core_group);
-	}
-#endif
-
-
-	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t nodemask = node_to_cpumask(i);
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
-			continue;
-
-		init_sched_build_groups(sched_group_phys, nodemask,
-						cpu_map, &cpu_to_phys_group);
-	}
-
-#ifdef CONFIG_NUMA
-	/* Set up node groups */
-	if (sched_group_allnodes)
-		init_sched_build_groups(sched_group_allnodes, *cpu_map,
-					cpu_map, &cpu_to_allnodes_group);
-
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		/* Set up node groups */
-		struct sched_group *sg, *prev;
-		cpumask_t nodemask = node_to_cpumask(i);
-		cpumask_t domainspan;
-		cpumask_t covered = CPU_MASK_NONE;
-		int j;
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask)) {
-			sched_group_nodes[i] = NULL;
-			continue;
-		}
-
-		domainspan = sched_domain_node_span(i);
-		cpus_and(domainspan, domainspan, *cpu_map);
-
-		sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
-		if (!sg) {
-			printk(KERN_WARNING "Can not alloc domain group for "
-				"node %d\n", i);
-			goto error;
-		}
-		sched_group_nodes[i] = sg;
-		for_each_cpu_mask(j, nodemask) {
-			struct sched_domain *sd;
-			sd = &per_cpu(node_domains, j);
-			sd->groups = sg;
-		}
-		sg->cpu_power = 0;
-		sg->cpumask = nodemask;
-		sg->next = sg;
-		cpus_or(covered, covered, nodemask);
-		prev = sg;
-
-		for (j = 0; j < MAX_NUMNODES; j++) {
-			cpumask_t tmp, notcovered;
-			int n = (i + j) % MAX_NUMNODES;
-
-			cpus_complement(notcovered, covered);
-			cpus_and(tmp, notcovered, *cpu_map);
-			cpus_and(tmp, tmp, domainspan);
-			if (cpus_empty(tmp))
-				break;
-
-			nodemask = node_to_cpumask(n);
-			cpus_and(tmp, tmp, nodemask);
-			if (cpus_empty(tmp))
-				continue;
-
-			sg = kmalloc_node(sizeof(struct sched_group),
-					  GFP_KERNEL, i);
-			if (!sg) {
-				printk(KERN_WARNING
-				"Can not alloc domain group for node %d\n", j);
-				goto error;
-			}
-			sg->cpu_power = 0;
-			sg->cpumask = tmp;
-			sg->next = prev->next;
-			cpus_or(covered, covered, tmp);
-			prev->next = sg;
-			prev = sg;
-		}
-	}
-#endif
-
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
@@ -6669,14 +6501,10 @@ static int build_sched_domains(const cpu
 	}
 
 #ifdef CONFIG_NUMA
+	for_each_cpu_mask(i, *cpu_map) {
 	for (i = 0; i < MAX_NUMNODES; i++)
-		init_numa_sched_groups_power(sched_group_nodes[i]);
-
-	if (sched_group_allnodes) {
-		int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
-		struct sched_group *sg = &sched_group_allnodes[group];
-
-		init_numa_sched_groups_power(sg);
+		init_numa_sched_groups_power(&per_cpu(node_domains, i));
+		init_numa_sched_groups_power(&per_cpu(allnodes_domains, i));
 	}
 #endif
 
@@ -6698,13 +6526,8 @@ static int build_sched_domains(const cpu
 	calibrate_migration_costs(cpu_map);
 
 	return 0;
-
-#ifdef CONFIG_NUMA
-error:
-	free_sched_groups(cpu_map);
-	return -ENOMEM;
-#endif
 }
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  */
@@ -6727,7 +6550,6 @@ static int arch_init_sched_domains(const
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	free_sched_groups(cpu_map);
 }
 
 /*
Index: linux-2.6.19-rc2-mm1/include/asm-ia64/topology.h
===================================================================
--- linux-2.6.19-rc2-mm1.orig/include/asm-ia64/topology.h	2006-10-18 17:36:36.606010591 -0500
+++ linux-2.6.19-rc2-mm1/include/asm-ia64/topology.h	2006-10-18 17:37:05.006883676 -0500
@@ -60,7 +60,6 @@ void build_cpu_to_node_map(void);
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 1,			\
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
@@ -86,7 +85,6 @@ void build_cpu_to_node_map(void);
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 8,			\
 	.max_interval		= 8*(min(num_online_cpus(), 32)), \
 	.busy_factor		= 64,			\
Index: linux-2.6.19-rc2-mm1/include/linux/topology.h
===================================================================
--- linux-2.6.19-rc2-mm1.orig/include/linux/topology.h	2006-10-18 17:36:36.596244128 -0500
+++ linux-2.6.19-rc2-mm1/include/linux/topology.h	2006-10-18 17:37:05.014696846 -0500
@@ -90,7 +90,6 @@
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 1,			\
 	.max_interval		= 2,			\
 	.busy_factor		= 8,			\
@@ -122,7 +121,6 @@
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 1,			\
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
@@ -153,7 +151,6 @@
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 1,			\
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
@@ -181,7 +178,6 @@
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 64,			\
 	.max_interval		= 64*num_online_cpus(),	\
 	.busy_factor		= 128,			\
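
The core of the change above is that a domain's sched_groups stop being a circular singly linked list threaded through ->next and become one contiguous array bounded by the new sd->lastgroup pointer (one past the last group): every do { ... } while (group != sd->groups) walk collapses into a plain for loop, and "exactly one group" becomes sd->groups + 1 == sd->lastgroup. The following is a minimal userspace sketch of that iteration pattern, not kernel code; the struct names mirror sched.h but carry only the fields the loop needs.

/*
 * Userspace sketch of the array + one-past-the-end iteration the patch
 * introduces for sched_group walks (find_idlest_group, find_busiest_group,
 * sched_domain_debug, init_sched_groups_power).
 */
#include <stdio.h>
#include <stdlib.h>

struct sched_group {
	unsigned long cpu_power;
};

struct sched_domain {
	struct sched_group *groups;	/* first group of the domain */
	struct sched_group *lastgroup;	/* one past the last group */
};

/* Array-style walk, the equivalent of the new for loops in the patch. */
static unsigned long total_power(struct sched_domain *sd)
{
	struct sched_group *group;
	unsigned long sum = 0;

	for (group = sd->groups; group < sd->lastgroup; group++)
		sum += group->cpu_power;
	return sum;
}

int main(void)
{
	struct sched_domain sd;
	int i;

	sd.groups = calloc(4, sizeof(*sd.groups));
	if (!sd.groups)
		return 1;
	sd.lastgroup = sd.groups + 4;	/* four groups in this domain */

	for (i = 0; i < 4; i++)
		sd.groups[i].cpu_power = 128;

	printf("groups: %ld, total power: %lu\n",
	       (long)(sd.lastgroup - sd.groups), total_power(&sd));
	/* A domain is degenerate when it has a single group. */
	printf("single group? %s\n",
	       sd.groups + 1 == sd.lastgroup ? "yes" : "no");

	free(sd.groups);
	return 0;
}

Keeping lastgroup as a one-past-the-end pointer means the group count and the single-group test reduce to pointer arithmetic, which is exactly what the reworked sd_degenerate() and sd_parent_degenerate() checks rely on.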
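The reworked init_sched_build_groups() also takes over the allocation: at most cpus_weight(span) groups are kmalloc'ed per domain, each group's cpumask is filled by collecting every CPU of the span for which group_fn() returns the same value, and sd->lastgroup records where the last group actually built ends (any unused tail of the allocation is simply never iterated). Below is a hedged userspace sketch of that construction, with cpumasks reduced to plain unsigned long bitmasks and an invented two-siblings-per-core group function; none of the names are kernel APIs.

/*
 * Userspace sketch of array-based group construction: allocate the worst
 * case (one group per CPU of the span), fill group masks by group id,
 * record the end of the built groups in lastgroup.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 8

struct group {
	unsigned long cpumask;
};

struct domain {
	unsigned long span;
	struct group *groups;
	struct group *lastgroup;
};

/* Example group_fn: two hyperthread siblings share a "core" group. */
static int cpu_to_group(int cpu)
{
	return cpu / 2;
}

static void build_groups(struct domain *d, int (*group_fn)(int))
{
	unsigned long covered = 0;
	struct group *g;
	int i, j;

	/* Worst case: every CPU of the span is its own group. */
	g = calloc(__builtin_popcountl(d->span), sizeof(*g));
	if (!g)
		exit(1);
	d->groups = g;

	for (i = 0; i < NR_CPUS; i++) {
		if (!(d->span & (1UL << i)) || (covered & (1UL << i)))
			continue;

		/* Collect every CPU of the span with the same group id. */
		for (j = 0; j < NR_CPUS; j++) {
			if (!(d->span & (1UL << j)) ||
			    group_fn(j) != group_fn(i))
				continue;
			covered |= 1UL << j;
			g->cpumask |= 1UL << j;
		}
		g++;
	}
	d->lastgroup = g;	/* one past the last group actually built */
}

int main(void)
{
	struct domain d = { .span = 0x3f };	/* CPUs 0-5 */
	struct group *g;

	build_groups(&d, cpu_to_group);
	for (g = d.groups; g < d.lastgroup; g++)
		printf("group mask 0x%lx\n", g->cpumask);
	free(d.groups);
	return 0;
}

As in the patch, the allocation is sized for the worst case; lastgroup, not the allocation size, bounds all later walks, so over-allocating for sparse spans only costs memory, never iterations.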
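The new sd_types enum and the per-level table of group functions point toward a table-driven setup: one enum entry per compiled-in domain level, and an array of group_fn pointers indexed by that enum. A small standalone sketch of that idea follows; the two levels, cpu_to_node_group() and the four-CPUs-per-node layout are invented for illustration and are not part of the patch.

/* Sketch of a per-level table of group functions indexed by an enum. */
#include <stdio.h>

enum sd_types {
	PHYS_DOMAINS,
	NODE_DOMAINS,
	NR_DOMAINS
};

static int cpu_to_phys_group(int cpu)
{
	return cpu;		/* one group per CPU at the physical level */
}

static int cpu_to_node_group(int cpu)
{
	return cpu / 4;		/* pretend four CPUs per node */
}

/* Array of function pointers, one group_fn per domain level. */
static int (*group_fn[NR_DOMAINS])(int) = {
	[PHYS_DOMAINS]	= cpu_to_phys_group,
	[NODE_DOMAINS]	= cpu_to_node_group,
};

int main(void)
{
	int level, cpu = 5;

	for (level = 0; level < NR_DOMAINS; level++)
		printf("level %d: cpu %d -> group %d\n",
		       level, cpu, group_fn[level](cpu));
	return 0;
}

The declarator for an array of function pointers puts the array size on the name, int (*group_fn[NR_DOMAINS])(int), rather than on the return type.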