--- linux-2.6-npiggin/arch/ia64/kernel/Makefile | 2 linux-2.6-npiggin/arch/ia64/kernel/domain.c | 341 +++++++++++++++++ linux-2.6-npiggin/include/asm-ia64/processor.h | 3 linux-2.6-npiggin/include/linux/notifier.h | 9 linux-2.6-npiggin/include/linux/sched.h | 148 +++++++ linux-2.6-npiggin/kernel/cpu.c | 9 linux-2.6-npiggin/kernel/sched.c | 493 ++++++------------------- 7 files changed, 638 insertions(+), 367 deletions(-) diff -puN arch/ia64/kernel/Makefile~rollup arch/ia64/kernel/Makefile --- linux-2.6/arch/ia64/kernel/Makefile~rollup 2004-09-09 13:20:16.000000000 +1000 +++ linux-2.6-npiggin/arch/ia64/kernel/Makefile 2004-09-09 13:20:16.000000000 +1000 @@ -14,7 +14,7 @@ obj-$(CONFIG_IA64_HP_ZX1) += acpi-ext.o obj-$(CONFIG_IA64_PALINFO) += palinfo.o obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_SMP) += smp.o smpboot.o +obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_KGDB) += kgdb_stub.o diff -puN /dev/null arch/ia64/kernel/domain.c --- /dev/null 2004-09-06 19:38:39.000000000 +1000 +++ linux-2.6-npiggin/arch/ia64/kernel/domain.c 2004-09-09 13:20:16.000000000 +1000 @@ -0,0 +1,341 @@ +/* + * arch/ia64/kernel/domain.c + * Architecture specific sched-domains builder. + * + * Copyright (C) 2004 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include + +#define SD_NODES_PER_DOMAIN 4 + +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int __devinit find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, i); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t __devinit sched_domain_node_span(int node) +{ + int i; + cpumask_t span; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + for (i = 0; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + cpumask_t nodemask; + + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} + + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. 
+ */ +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA +/* + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. + */ +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group *sched_group_nodes[MAX_NUMNODES]; +#endif + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +void __devinit arch_init_sched_domains(void) +{ + int i; + cpumask_t cpu_default_map; + + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); + + /* + * Set up domains. Isolated domains just stay on the dummy domain. + */ + for_each_cpu_mask(i, cpu_default_map) { + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + + cpus_and(nodemask, nodemask, cpu_default_map); + +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + *sd = SD_NODE_INIT; + sd->span = sched_domain_node_span(cpu_to_node(i)); + cpus_and(sd->span, sd->span, cpu_default_map); +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + cpus_and(sd->span, sd->span, cpu_default_map); + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu_mask(i, cpu_default_map) { + cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + if (i != first_cpu(this_sibling_map)) + continue; + + init_sched_build_groups(sched_group_cpus, this_sibling_map, + &cpu_to_cpu_group); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); + } + +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, cpu_default_map); + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + 
/* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, cpu_default_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu_mask(i, cpu_default_map) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + } + +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + + /* Attach the domains */ + for_each_online_cpu(i) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); + } +} + +void __devinit arch_destroy_sched_domains(void) +{ + int i; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + sched_group_nodes[i] = NULL; + } +} + diff -puN include/asm-ia64/processor.h~rollup include/asm-ia64/processor.h --- linux-2.6/include/asm-ia64/processor.h~rollup 2004-09-09 13:20:16.000000000 +1000 +++ linux-2.6-npiggin/include/asm-ia64/processor.h 2004-09-09 13:20:16.000000000 +1000 @@ -20,6 +20,9 @@ #include #include +/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */ +#define ARCH_HAS_SCHED_DOMAIN + #define IA64_NUM_DBG_REGS 8 /* * Limits for PMC and PMD are set to less than maximum architected values diff -puN include/linux/notifier.h~rollup include/linux/notifier.h --- linux-2.6/include/linux/notifier.h~rollup 2004-09-09 13:20:16.000000000 +1000 +++ linux-2.6-npiggin/include/linux/notifier.h 2004-09-09 13:20:16.000000000 +1000 @@ -60,10 +60,11 @@ extern int notifier_call_chain(struct no #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ -#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ -#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ 
-#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ -#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ +#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ +#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ +#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ +#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */ +#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff -puN include/linux/sched.h~rollup include/linux/sched.h --- linux-2.6/include/linux/sched.h~rollup 2004-09-09 13:20:16.000000000 +1000 +++ linux-2.6-npiggin/include/linux/sched.h 2004-09-09 13:20:16.000000000 +1000 @@ -399,6 +399,154 @@ struct sched_info { extern struct file_operations proc_schedstat_operations; #endif +enum idle_type +{ + IDLE, + NOT_IDLE, + NEWLY_IDLE, + MAX_IDLE_TYPES +}; + +/* + * sched-domains (multiprocessor balancing) declarations: + */ +#ifdef CONFIG_SMP +#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ + +#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ +#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 4 /* Balance on exec */ +#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This is read only (except for setup, hotplug CPU). + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + int flags; /* See SD_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. */ + unsigned int nr_balance_failed; /* initialise to 0 */ + +#ifdef CONFIG_SCHEDSTATS + /* load_balance() stats */ + unsigned long lb_cnt[MAX_IDLE_TYPES]; + unsigned long lb_failed[MAX_IDLE_TYPES]; + unsigned long lb_imbalance[MAX_IDLE_TYPES]; + unsigned long lb_nobusyg[MAX_IDLE_TYPES]; + unsigned long lb_nobusyq[MAX_IDLE_TYPES]; + + /* sched_balance_exec() stats */ + unsigned long sbe_attempts; + unsigned long sbe_pushed; + + /* try_to_wake_up() stats */ + unsigned long ttwu_wake_affine; + unsigned long ttwu_wake_balance; +#endif +}; + +#ifdef ARCH_HAS_SCHED_DOMAIN +/* Useful helpers that arch setup code may use. 
Defined in kernel/sched.c */ +extern cpumask_t cpu_isolated_map; +extern void init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)); +extern void cpu_attach_domain(struct sched_domain *sd, int cpu); +#endif + +#ifndef ARCH_HAS_SCHED_TUNE +#ifdef CONFIG_SCHED_SMT +#define ARCH_HAS_SCHED_WAKE_IDLE +/* Common values for SMT siblings */ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 8, \ + .imbalance_pct = 110, \ + .cache_hot_time = 0, \ + .cache_nice_tries = 0, \ + .per_cpu_gain = 25, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ + | SD_SHARE_CPUPOWER, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif + +/* Common values for CPUs */ +#define SD_CPU_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_hot_time = (5*1000/2), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} + +#ifdef CONFIG_NUMA +#define SD_NODE_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .groups = NULL, \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_hot_time = (10*1000), \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .flags = SD_BALANCE_EXEC \ + | SD_WAKE_BALANCE, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif +#endif /* ARCH_HAS_SCHED_TUNE */ +#endif /* CONFIG_SMP */ + + struct io_context; /* See blkdev.h */ void exit_io_context(void); struct cpuset; diff -puN kernel/cpu.c~rollup kernel/cpu.c --- linux-2.6/kernel/cpu.c~rollup 2004-09-09 13:20:16.000000000 +1000 +++ linux-2.6-npiggin/kernel/cpu.c 2004-09-09 13:20:16.000000000 +1000 @@ -119,6 +119,15 @@ int cpu_down(unsigned int cpu) goto out; } + err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + (void *)(long)cpu); + if (err == NOTIFY_BAD) { + printk("%s: attempt to take down CPU %u failed\n", + __FUNCTION__, cpu); + err = -EINVAL; + goto out; + } + /* Ensure that we are not runnable on dying cpu */ old_allowed = current->cpus_allowed; tmp = CPU_MASK_ALL; diff -puN kernel/sched.c~rollup kernel/sched.c --- linux-2.6/kernel/sched.c~rollup 2004-09-09 13:20:16.000000000 +1000 +++ linux-2.6-npiggin/kernel/sched.c 2004-09-09 13:20:16.000000000 +1000 @@ -119,16 +119,6 @@ int sched_max_base = 10000; #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) -enum idle_type -{ - IDLE, - NOT_IDLE, - NEWLY_IDLE, - MAX_IDLE_TYPES -}; - -struct sched_domain; - /* * These are the runqueue data structures: */ @@ -228,140 +218,6 @@ struct runqueue { static DEFINE_PER_CPU(struct runqueue, runqueues); -/* - * sched-domains (multiprocessor balancing) declarations: - */ -#ifdef CONFIG_SMP -#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */ - -#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 2 /* Balance on exec */ -#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 16 /* 
Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */ - -struct sched_group { - struct sched_group *next; /* Must be a circular list */ - cpumask_t cpumask; - - /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. This should be read only (except for setup). Although - * it will need to be written to at cpu hot(un)plug time, perhaps the - * cpucontrol semaphore will provide enough exclusion? - */ - unsigned long cpu_power; -}; - -struct sched_domain { - /* These fields must be setup */ - struct sched_domain *parent; /* top domain must be null terminated */ - struct sched_group *groups; /* the balancing groups of the domain */ - cpumask_t span; /* span of all CPUs in this domain */ - unsigned long min_interval; /* Minimum balance interval ms */ - unsigned long max_interval; /* Maximum balance interval ms */ - unsigned int busy_factor; /* less balancing by factor if busy */ - unsigned int imbalance_pct; /* No balance until over watermark */ - unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ - unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ - int flags; /* See SD_* */ - - /* Runtime fields. */ - unsigned long last_balance; /* init to jiffies. units in jiffies */ - unsigned int balance_interval; /* initialise to 1. units in ms. */ - unsigned int nr_balance_failed; /* initialise to 0 */ - -#ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ - unsigned long lb_cnt[MAX_IDLE_TYPES]; - unsigned long lb_failed[MAX_IDLE_TYPES]; - unsigned long lb_imbalance[MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[MAX_IDLE_TYPES]; - - /* sched_balance_exec() stats */ - unsigned long sbe_attempts; - unsigned long sbe_pushed; - - /* try_to_wake_up() stats */ - unsigned long ttwu_wake_affine; - unsigned long ttwu_wake_balance; -#endif -}; - -#ifndef ARCH_HAS_SCHED_TUNE -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_WAKE_IDLE -/* Common values for SMT siblings */ -#define SD_SIBLING_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 8, \ - .imbalance_pct = 110, \ - .cache_hot_time = 0, \ - .cache_nice_tries = 0, \ - .per_cpu_gain = 25, \ - .flags = SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ - | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif - -/* Common values for CPUs */ -#define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_hot_time = (5*1000/2), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - -/* Arch can override this macro in processor.h */ -#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT) -#define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_hot_time = (10*1000), \ - .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ - .flags = SD_BALANCE_EXEC 
\ - | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} -#endif -#endif /* ARCH_HAS_SCHED_TUNE */ -#endif - - #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) @@ -1017,8 +873,7 @@ static int wake_idle(int cpu, task_t *p) if (!(sd->flags & SD_WAKE_IDLE)) return cpu; - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) @@ -1532,8 +1387,7 @@ static int find_idlest_cpu(struct task_s min_cpu = UINT_MAX; min_load = ULONG_MAX; - cpus_and(mask, sd->span, cpu_online_map); - cpus_and(mask, mask, p->cpus_allowed); + cpus_and(mask, sd->span, p->cpus_allowed); for_each_cpu_mask(i, mask) { load = target_load(i); @@ -1619,8 +1473,8 @@ void sched_exec(void) if (tmp->flags & SD_BALANCE_EXEC) sd = tmp; - schedstat_inc(sd, sbe_attempts); if (sd) { + schedstat_inc(sd, sbe_attempts); new_cpu = find_idlest_cpu(current, this_cpu, sd); if (new_cpu != this_cpu) { schedstat_inc(sd, sbe_pushed); @@ -1789,7 +1643,6 @@ find_busiest_group(struct sched_domain * max_load = this_load = total_load = total_pwr = 0; do { - cpumask_t tmp; unsigned long load; int local_group; int i, nr_cpus = 0; @@ -1798,11 +1651,8 @@ find_busiest_group(struct sched_domain * /* Tally up the load of all CPUs in the group */ avg_load = 0; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (unlikely(cpus_empty(tmp))) - goto nextgroup; - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i); @@ -1921,13 +1771,11 @@ out_balanced: */ static runqueue_t *find_busiest_queue(struct sched_group *group) { - cpumask_t tmp; unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; - cpus_and(tmp, group->cpumask, cpu_online_map); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { load = source_load(i); if (load > max_load) { @@ -2124,18 +1972,13 @@ static void active_load_balance(runqueue group = sd->groups; do { - cpumask_t tmp; runqueue_t *rq; int push_cpu = 0; if (group == busy_group) goto next_group; - cpus_and(tmp, group->cpumask, cpu_online_map); - if (!cpus_weight(tmp)) - goto next_group; - - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, group->cpumask) { if (!idle_cpu(i)) goto next_group; push_cpu = i; @@ -2196,8 +2039,12 @@ static void rebalance_tick(int this_cpu, this_rq->cpu_load = (old_load + this_load) / 2; for_each_domain(this_cpu, sd) { - unsigned long interval = sd->balance_interval; + unsigned long interval; + + if (!sd->flags & SD_LOAD_BALANCE) + continue; + interval = sd->balance_interval; if (idle != IDLE) interval *= sd->busy_factor; @@ -2334,7 +2181,7 @@ static inline void wake_sleeping_depende */ spin_unlock(&this_rq->lock); - cpus_and(sibling_map, sd->span, cpu_online_map); + sibling_map = sd->span; for_each_cpu_mask(i, sibling_map) spin_lock(&cpu_rq(i)->lock); @@ -2379,7 +2226,7 @@ static inline int dependent_sleeper(int * wake_sleeping_dependent(): */ spin_unlock(&this_rq->lock); - cpus_and(sibling_map, sd->span, cpu_online_map); + sibling_map = sd->span; for_each_cpu_mask(i, sibling_map) spin_lock(&cpu_rq(i)->lock); cpu_clear(this_cpu, sibling_map); @@ -4016,16 +3863,17 @@ spinlock_t kernel_flag __cacheline_align EXPORT_SYMBOL(kernel_flag); #ifdef CONFIG_SMP -/* Attach the domain 'sd' to 'cpu' as its base domain */ -static void cpu_attach_domain(struct sched_domain *sd, 
int cpu) +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; unsigned long flags; runqueue_t *rq = cpu_rq(cpu); int local = 1; - lock_cpu_hotplug(); - spin_lock_irqsave(&rq->lock, flags); if (cpu == smp_processor_id() || !cpu_online(cpu)) { @@ -4044,128 +3892,10 @@ static void cpu_attach_domain(struct sch wake_up_process(rq->migration_thread); wait_for_completion(&req.done); } - - unlock_cpu_hotplug(); -} - -/* - * To enable disjoint top-level NUMA domains, define SD_NODES_PER_DOMAIN - * in arch code. That defines the number of nearby nodes in a node's top - * level scheduling domain. - */ -#if defined(CONFIG_NUMA) && defined(SD_NODES_PER_DOMAIN) -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int __init find_next_best_node(int node, unsigned long *used_nodes) -{ - int i, n, val, min_val, best_node = 0; - - min_val = INT_MAX; - - for (i = 0; i < numnodes; i++) { - /* Start at @node */ - n = (node + i) % numnodes; - - /* Skip already used nodes */ - if (test_bit(n, used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, i); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - set_bit(best_node, used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. 
- */ -cpumask_t __init sched_domain_node_span(int node) -{ - int i; - cpumask_t span; - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); - - for (i = 0; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); - cpumask_t nodemask; - - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); - } - - return span; -} -#else /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ -cpumask_t __init sched_domain_node_span(int node) -{ - return cpu_possible_map; -} -#endif /* CONFIG_NUMA && SD_NODES_PER_DOMAIN */ - -#ifdef CONFIG_SCHED_SMT -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static struct sched_group sched_group_cpus[NR_CPUS]; -__init static int cpu_to_cpu_group(int cpu) -{ - return cpu; -} -#endif - -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; -__init static int cpu_to_phys_group(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - return first_cpu(cpu_sibling_map[cpu]); -#else - return cpu; -#endif -} - -#ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -__init static int cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); } -#endif - -/* Groups for isolated scheduling domains */ -static struct sched_group sched_group_isolated[NR_CPUS]; /* cpus with isolated domains */ -cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE; - -__init static int cpu_to_isolated_group(int cpu) -{ - return cpu; -} +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4192,7 +3922,7 @@ __setup ("isolcpus=", isolated_cpu_setup * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -__init static void init_sched_build_groups(struct sched_group groups[], +void __devinit init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4226,7 +3956,45 @@ __init static void init_sched_build_grou last->next = first; } -__init static void arch_init_sched_domains(void) + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __devinit arch_init_sched_domains(void); +extern void __devinit arch_destroy_sched_domains(void); +#else +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA + +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static int __devinit cpu_to_node_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void __devinit arch_init_sched_domains(void) { int i; cpumask_t cpu_default_map; @@ -4237,45 +4005,23 @@ __init static void arch_init_sched_domai * exclude other special cases in the future. 
*/ cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_possible_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); - /* Set up domains */ - for_each_cpu(i) { + /* + * Set up domains. Isolated domains just stay on the dummy domain. + */ + for_each_cpu_mask(i, cpu_default_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(nodemask, nodemask, cpu_default_map); - /* - * Set up isolated domains. - * Unlike those of other cpus, the domains and groups are - * single level, and span a single cpu. - */ - if (cpu_isset(i, cpu_isolated_map)) { -#ifdef CONFIG_SCHED_SMT - sd = &per_cpu(cpu_domains, i); -#else - sd = &per_cpu(phys_domains, i); -#endif - group = cpu_to_isolated_group(i); - *sd = SD_CPU_INIT; - cpu_set(i, sd->span); - sd->balance_interval = INT_MAX; /* Don't balance */ - sd->flags = 0; /* Avoid WAKE_ */ - sd->groups = &sched_group_isolated[group]; - printk(KERN_INFO "Setting up cpu %d isolated.\n", i); - /* Single level, so continue with next cpu */ - continue; - } - #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - /* FIXME: should be multilevel, in arch code */ - sd->span = sched_domain_node_span(i); - cpus_and(sd->span, sd->span, cpu_default_map); + sd->span = cpu_default_map; sd->groups = &sched_group_nodes[group]; #endif @@ -4283,11 +4029,7 @@ __init static void arch_init_sched_domai sd = &per_cpu(phys_domains, i); group = cpu_to_phys_group(i); *sd = SD_CPU_INIT; -#ifdef CONFIG_NUMA sd->span = nodemask; -#else - sd->span = cpu_possible_map; -#endif sd->parent = p; sd->groups = &sched_group_phys[group]; @@ -4305,7 +4047,7 @@ __init static void arch_init_sched_domai #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu(i) { + for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); if (i != first_cpu(this_sibling_map)) @@ -4316,16 +4058,6 @@ __init static void arch_init_sched_domai } #endif - /* Set up isolated groups */ - for_each_cpu_mask(i, cpu_isolated_map) { - cpumask_t mask; - cpus_clear(mask); - cpu_set(i, mask); - init_sched_build_groups(sched_group_isolated, mask, - &cpu_to_isolated_group); - } - -#ifdef CONFIG_NUMA /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -4337,10 +4069,6 @@ __init static void arch_init_sched_domai init_sched_build_groups(sched_group_phys, nodemask, &cpu_to_phys_group); } -#else - init_sched_build_groups(sched_group_phys, cpu_possible_map, - &cpu_to_phys_group); -#endif #ifdef CONFIG_NUMA /* Set up node groups */ @@ -4373,7 +4101,7 @@ __init static void arch_init_sched_domai } /* Attach the domains */ - for_each_cpu(i) { + for_each_online_cpu(i) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -4384,21 +4112,27 @@ __init static void arch_init_sched_domai } } +static void __devinit arch_destroy_sched_domains(void) +{ + /* Do nothing: everything is statically allocated. */ +} + +#endif /* ARCH_HAS_SCHED_DOMAIN */ + #define SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG -void sched_domain_debug(void) +static void sched_domain_debug(void) { int i; - for_each_cpu(i) { + for_each_online_cpu(i) { runqueue_t *rq = cpu_rq(i); struct sched_domain *sd; int level = 0; sd = rq->sd; - printk(KERN_DEBUG "CPU%d: %s\n", - i, (cpu_online(i) ? 
" online" : "offline")); + printk(KERN_DEBUG "CPU%d:\n", i); do { int j; @@ -4464,10 +4198,63 @@ void sched_domain_debug(void) #define sched_domain_debug() {} #endif +#ifdef CONFIG_SMP +/* + * Initial dummy domain for early boot and for hotplug cpu. Being static, + * it is initialized to zero, so all balancing flags are cleared which is + * what we want. + */ +static struct sched_domain sched_domain_dummy; +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Force a reinitialization of the sched domains hierarchy. The domains + * and groups cannot be updated in place without racing with the balancing + * code, so we temporarily attach all running cpus to a "dummy" domain + * which will prevent rebalancing while the sched domains are recalculated. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int i; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + for_each_online_cpu(i) + cpu_attach_domain(&sched_domain_dummy, i); + arch_destroy_sched_domains(); + return NOTIFY_OK; + + case CPU_UP_CANCELED: + case CPU_ONLINE: + case CPU_DEAD: + /* + * Fall through and re-initialise the domains. + */ + break; + default: + return NOTIFY_DONE; + } + + /* The hotplug lock is already held by cpu_up/cpu_down */ + arch_init_sched_domains(); + + sched_domain_debug(); + + return NOTIFY_OK; +} +#endif + void __init sched_init_smp(void) { + lock_cpu_hotplug(); arch_init_sched_domains(); sched_domain_debug(); + unlock_cpu_hotplug(); + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); } #else void __init sched_init_smp(void) @@ -4488,24 +4275,6 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; -#ifdef CONFIG_SMP - /* Set up an initial dummy domain for early boot */ - static struct sched_domain sched_domain_init; - static struct sched_group sched_group_init; - - memset(&sched_domain_init, 0, sizeof(struct sched_domain)); - sched_domain_init.span = CPU_MASK_ALL; - sched_domain_init.groups = &sched_group_init; - sched_domain_init.last_balance = jiffies; - sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ - sched_domain_init.busy_factor = 1; - - memset(&sched_group_init, 0, sizeof(struct sched_group)); - sched_group_init.cpumask = CPU_MASK_ALL; - sched_group_init.next = &sched_group_init; - sched_group_init.cpu_power = SCHED_LOAD_SCALE; -#endif - for (i = 0; i < NR_CPUS; i++) { prio_array_t *array; @@ -4515,7 +4284,7 @@ void __init sched_init(void) rq->expired = rq->arrays + 1; #ifdef CONFIG_SMP - rq->sd = &sched_domain_init; + rq->sd = &sched_domain_dummy; rq->cpu_load = 0; rq->active_balance = 0; rq->push_cpu = 0; _