From: "Siddha, Suresh B" Clean up the sched group cpu_power setup code, by introducing child field and new domain flag in sched_domain. Signed-off-by: Suresh Siddha Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/asm-i386/topology.h | 1 include/asm-ia64/topology.h | 2 include/asm-mips/mach-ip27/topology.h | 1 include/asm-powerpc/topology.h | 1 include/asm-x86_64/topology.h | 1 include/linux/sched.h | 12 + include/linux/topology.h | 46 +++++-- kernel/sched.c | 150 ++++++++++++------------ 8 files changed, 127 insertions(+), 87 deletions(-) diff -puN include/asm-i386/topology.h~sched-group-cpu-power-setup-cleanup include/asm-i386/topology.h --- a/include/asm-i386/topology.h~sched-group-cpu-power-setup-cleanup +++ a/include/asm-i386/topology.h @@ -74,6 +74,7 @@ static inline int node_to_first_cpu(int #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 8, \ .max_interval = 32, \ diff -puN include/asm-ia64/topology.h~sched-group-cpu-power-setup-cleanup include/asm-ia64/topology.h --- a/include/asm-ia64/topology.h~sched-group-cpu-power-setup-cleanup +++ a/include/asm-ia64/topology.h @@ -59,6 +59,7 @@ void build_cpu_to_node_map(void); #define SD_CPU_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 1, \ .max_interval = 4, \ @@ -84,6 +85,7 @@ void build_cpu_to_node_map(void); #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 8, \ .max_interval = 8*(min(num_online_cpus(), 32)), \ diff -puN include/asm-mips/mach-ip27/topology.h~sched-group-cpu-power-setup-cleanup include/asm-mips/mach-ip27/topology.h --- a/include/asm-mips/mach-ip27/topology.h~sched-group-cpu-power-setup-cleanup +++ a/include/asm-mips/mach-ip27/topology.h @@ -22,6 +22,7 @@ extern unsigned char __node_distances[MA #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 8, \ .max_interval = 32, \ diff -puN include/asm-powerpc/topology.h~sched-group-cpu-power-setup-cleanup include/asm-powerpc/topology.h --- a/include/asm-powerpc/topology.h~sched-group-cpu-power-setup-cleanup +++ a/include/asm-powerpc/topology.h @@ -43,6 +43,7 @@ extern int pcibus_to_node(struct pci_bus #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 8, \ .max_interval = 32, \ diff -puN include/asm-x86_64/topology.h~sched-group-cpu-power-setup-cleanup include/asm-x86_64/topology.h --- a/include/asm-x86_64/topology.h~sched-group-cpu-power-setup-cleanup +++ a/include/asm-x86_64/topology.h @@ -31,6 +31,7 @@ extern int __node_distance(int, int); #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 8, \ .max_interval = 32, \ diff -puN include/linux/sched.h~sched-group-cpu-power-setup-cleanup include/linux/sched.h --- a/include/linux/sched.h~sched-group-cpu-power-setup-cleanup +++ a/include/linux/sched.h @@ -623,9 +623,16 @@ enum idle_type #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ +#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ -#define BALANCE_FOR_POWER 
((sched_mc_power_savings || sched_smt_power_savings) \ - ? SD_POWERSAVINGS_BALANCE : 0) +#define BALANCE_FOR_MC_POWER (sched_smt_power_savings ? \ + SD_POWERSAVINGS_BALANCE : 0) + +#define BALANCE_FOR_PKG_POWER \ + ((sched_mc_power_savings || sched_smt_power_savings) ? \ + SD_POWERSAVINGS_BALANCE : 0) + +#define test_sd_flag(sd, flag) ((sd && sd->flags & flag) ? 1 : 0) struct sched_group { @@ -642,6 +649,7 @@ struct sched_group { struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ diff -puN include/linux/topology.h~sched-group-cpu-power-setup-cleanup include/linux/topology.h --- a/include/linux/topology.h~sched-group-cpu-power-setup-cleanup +++ a/include/linux/topology.h @@ -89,6 +89,7 @@ #define SD_SIBLING_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 1, \ .max_interval = 2, \ @@ -114,11 +115,44 @@ #endif #endif /* CONFIG_SCHED_SMT */ +#ifdef CONFIG_SCHED_MC +/* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ +#ifndef SD_MC_INIT +#define SD_MC_INIT (struct sched_domain) { \ + .span = CPU_MASK_NONE, \ + .parent = NULL, \ + .child = NULL, \ + .groups = NULL, \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .per_cpu_gain = 100, \ + .busy_idx = 2, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ + | SD_SHARE_PKG_RESOURCES\ + | BALANCE_FOR_MC_POWER, \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ +} +#endif +#endif /* CONFIG_SCHED_MC */ + /* Common values for CPUs */ #ifndef SD_CPU_INIT #define SD_CPU_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 1, \ .max_interval = 4, \ @@ -135,7 +169,7 @@ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | BALANCE_FOR_POWER, \ + | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ @@ -146,6 +180,7 @@ #define SD_ALLNODES_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ + .child = NULL, \ .groups = NULL, \ .min_interval = 64, \ .max_interval = 64*num_online_cpus(), \ @@ -165,15 +200,6 @@ .nr_balance_failed = 0, \ } -#ifdef CONFIG_SCHED_MC -#ifndef SD_MC_INIT -/* for now its same as SD_CPU_INIT. - * TBD: Tune Domain parameters! - */ -#define SD_MC_INIT SD_CPU_INIT -#endif -#endif - #ifdef CONFIG_NUMA #ifndef SD_NODE_INIT #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 
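
To illustrate the intent of the new helper: test_sd_flag() is just a
NULL-tolerant flag test, so callers can pass sd->parent directly even for a
top-level domain whose parent is NULL. What follows is a minimal userspace
sketch of that behavior (mock sched_domain layout and flag value only, not
the kernel's own definitions):

	#include <stdio.h>

	#define SD_POWERSAVINGS_BALANCE	256	/* value as in the sched.h hunk above */

	/* Mock of the two linkage fields this patch cares about. */
	struct sched_domain {
		struct sched_domain *parent;	/* NULL at the top level */
		struct sched_domain *child;	/* NULL at the bottom level */
		int flags;
	};

	#define test_sd_flag(sd, flag)	((sd && sd->flags & flag) ? 1 : 0)

	int main(void)
	{
		struct sched_domain phys = { .flags = SD_POWERSAVINGS_BALANCE };
		struct sched_domain smt = { .parent = &phys };

		phys.child = &smt;	/* mirrors the new p->child assignments below */

		/* Flagged parent present: evaluates to 1. */
		printf("%d\n", test_sd_flag(smt.parent, SD_POWERSAVINGS_BALANCE));
		/* Top-level domain: NULL parent is safe, evaluates to 0. */
		printf("%d\n", test_sd_flag(phys.parent, SD_POWERSAVINGS_BALANCE));
		return 0;
	}

This NULL tolerance is what lets the load_balance paths in the kernel/sched.c
diff below replace the global sched_smt_power_savings checks with a per-domain
test on sd->parent.
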
diff -puN kernel/sched.c~sched-group-cpu-power-setup-cleanup kernel/sched.c
--- a/kernel/sched.c~sched-group-cpu-power-setup-cleanup
+++ a/kernel/sched.c
@@ -1273,7 +1273,6 @@ static int sched_balance_self(int cpu, i
 		cpumask_t span;
 		struct sched_group *group;
 		int new_cpu;
-		int weight;
 
 		span = sd->span;
 		group = find_idlest_group(sd, t, cpu);
@@ -1287,14 +1286,9 @@ static int sched_balance_self(int cpu, i
 		/* Now try balancing at a lower domain level */
 		cpu = new_cpu;
 nextlevel:
-		sd = NULL;
-		weight = cpus_weight(span);
-		for_each_domain(cpu, tmp) {
-			if (weight <= cpus_weight(tmp->span))
-				break;
-			if (tmp->flags & flag)
-				sd = tmp;
-		}
+		sd = sd->child;
+		if (sd && !(sd->flags & flag))
+			goto nextlevel;
 		/* while loop will break here if sd == NULL */
 	}
 
@@ -2499,8 +2493,14 @@ static int load_balance(int this_cpu, st
 	unsigned long imbalance;
 	struct rq *busiest;
 
+	/*
+	 * When the power savings policy is enabled for the parent domain, an
+	 * idle sibling can pick up load irrespective of busy siblings. In this
+	 * case, let the state of the idle sibling percolate up as IDLE,
+	 * instead of portraying it as NOT_IDLE.
+	 */
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_flag(sd->parent, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2590,7 +2590,7 @@ static int load_balance(int this_cpu, st
 	}
 
 	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_flag(sd->parent, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return nr_moved;
 
@@ -2606,7 +2606,7 @@ out_one_pinned:
 	sd->balance_interval *= 2;
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_flag(sd->parent, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return 0;
 }
@@ -2627,7 +2627,14 @@ load_balance_newidle(int this_cpu, struc
 	int nr_moved = 0;
 	int sd_idle = 0;
 
-	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+	/*
+	 * When the power savings policy is enabled for the parent domain, an
+	 * idle sibling can pick up load irrespective of busy siblings. In this
+	 * case, let the state of the idle sibling percolate up as IDLE,
+	 * instead of portraying it as NOT_IDLE.
+	 */
+	if (sd->flags & SD_SHARE_CPUPOWER && sd->parent &&
+	    !(sd->parent->flags & SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2659,7 +2666,8 @@ load_balance_newidle(int this_cpu, struc
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
 
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+		    !test_sd_flag(sd->parent, SD_POWERSAVINGS_BALANCE))
 			return -1;
 	} else
 		sd->nr_balance_failed = 0;
@@ -5329,7 +5337,9 @@ static int sd_degenerate(struct sched_do
 	if (sd->flags & (SD_LOAD_BALANCE |
 			 SD_BALANCE_NEWIDLE |
 			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC)) {
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5363,7 +5373,9 @@ sd_parent_degenerate(struct sched_domain
 		pflags &= ~(SD_LOAD_BALANCE |
 				SD_BALANCE_NEWIDLE |
 				SD_BALANCE_FORK |
-				SD_BALANCE_EXEC);
+				SD_BALANCE_EXEC |
+				SD_SHARE_CPUPOWER |
+				SD_SHARE_PKG_RESOURCES);
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -5385,12 +5397,18 @@ static void cpu_attach_domain(struct sch
 		struct sched_domain *parent = tmp->parent;
 		if (!parent)
 			break;
-		if (sd_parent_degenerate(tmp, parent))
+		if (sd_parent_degenerate(tmp, parent)) {
 			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+		}
 	}
 
-	if (sd && sd_degenerate(sd))
+	if (sd && sd_degenerate(sd)) {
 		sd = sd->parent;
+		if (sd)
+			sd->child = NULL;
+	}
 
 	sched_domain_debug(sd, cpu);
@@ -6163,6 +6181,31 @@ next_sg:
 	}
 }
 
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+	struct sched_domain *child;
+	struct sched_group *group;
+
+	if (!sd || !sd->groups || (cpu != first_cpu(sd->groups->cpumask)))
+		return;
+
+	child = sd->child;
+
+	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+		       (child->flags & SD_SHARE_CPUPOWER ||
+			child->flags & SD_SHARE_PKG_RESOURCES))) {
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		return;
+	}
+
+	sd->groups->cpu_power = 0;
+	group = child->groups;
+	do {
+		sd->groups->cpu_power += group->cpu_power;
+		group = group->next;
+	} while (group != child->groups);
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
@@ -6171,6 +6214,7 @@ static int build_sched_domains(const cpu
 {
 	int i;
 	struct sched_group *sched_group_phys = NULL;
+	struct sched_domain *sd;
 #ifdef CONFIG_SCHED_MC
 	struct sched_group *sched_group_core = NULL;
 #endif
@@ -6229,6 +6273,8 @@ static int build_sched_domains(const cpu
 		*sd = SD_NODE_INIT;
 		sd->span = sched_domain_node_span(cpu_to_node(i));
 		sd->parent = p;
+		if (p)
+			p->child = sd;
 		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
@@ -6250,6 +6296,8 @@ static int build_sched_domains(const cpu
 		*sd = SD_CPU_INIT;
 		sd->span = nodemask;
 		sd->parent = p;
+		if (p)
+			p->child = sd;
 		sd->groups = &sched_group_phys[group];
 
 #ifdef CONFIG_SCHED_MC
@@ -6272,6 +6320,7 @@ static int build_sched_domains(const cpu
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
+		p->child = sd;
 		sd->groups = &sched_group_core[group];
 #endif
@@ -6283,6 +6332,7 @@ static int build_sched_domains(const cpu
 		sd->span = cpu_sibling_map[i];
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
+		p->child = sd;
 		sd->groups = &sched_group_cpus[group];
 #endif
 	}
@@ -6401,77 +6451,27 @@ static int build_sched_domains(const cpu
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
 		sd = &per_cpu(cpu_domains, i);
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-				* SCHED_LOAD_SCALE / 10;
-		sd->groups->cpu_power = power;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(phys_domains, i);
-		if (i != first_cpu(sd->groups->cpumask))
-			continue;
-
-		sd->groups->cpu_power = 0;
-		if (sched_mc_power_savings || sched_smt_power_savings) {
-			int j;
-
-			for_each_cpu_mask(j, sd->groups->cpumask) {
-				struct sched_domain *sd1;
-				sd1 = &per_cpu(core_domains, j);
-				/*
-				 * for each core we will add once
-				 * to the group in physical domain
-				 */
-				if (j != first_cpu(sd1->groups->cpumask))
-					continue;
-
-				if (sched_smt_power_savings)
-					sd->groups->cpu_power += sd1->groups->cpu_power;
-				else
-					sd->groups->cpu_power += SCHED_LOAD_SCALE;
-			}
-		} else
-			/*
-			 * This has to be < 2 * SCHED_LOAD_SCALE
-			 * Lets keep it SCHED_LOAD_SCALE, so that
-			 * while calculating NUMA group's cpu_power
-			 * we can simply do
-			 *  numa_group->cpu_power += phys_group->cpu_power;
-			 *
-			 * See "only add power once for each physical pkg"
-			 * comment below
-			 */
-			sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-		int power;
-		sd = &per_cpu(phys_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
+		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++)
-		init_numa_sched_groups_power(sched_group_nodes[i]);
+	for_each_cpu_mask(i, *cpu_map) {
+		sd = &per_cpu(node_domains, i);
+		init_sched_groups_power(i, sd);
+	}
 
 	init_numa_sched_groups_power(sched_group_allnodes);
 #endif
_
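
The effect of the consolidated setup is easiest to see in isolation: when a
domain has a child that does not share cpu power or package resources (or
when power savings balancing is enabled for the domain), the group's
cpu_power becomes the sum over the child domain's circular group list,
replacing the old open-coded SMT/MC/phys special cases. A minimal userspace
sketch of that accumulation (mock types and an illustrative SCHED_LOAD_SCALE
value, not the kernel's definitions):

	#include <stdio.h>

	#define SCHED_LOAD_SCALE	1024UL	/* illustrative value */

	/* Mock of the circular group list init_sched_groups_power() walks. */
	struct sched_group {
		struct sched_group *next;	/* circular list of groups */
		unsigned long cpu_power;
	};

	/* Same accumulation as the do/while loop in init_sched_groups_power(). */
	static unsigned long sum_child_power(struct sched_group *child_groups)
	{
		struct sched_group *group = child_groups;
		unsigned long power = 0;

		do {
			power += group->cpu_power;
			group = group->next;
		} while (group != child_groups);

		return power;
	}

	int main(void)
	{
		/* Two core-level groups feeding one physical-package group. */
		struct sched_group core[2] = {
			{ &core[1], SCHED_LOAD_SCALE },
			{ &core[0], SCHED_LOAD_SCALE },
		};

		/* Package group's cpu_power = sum of child domain's groups: 2048. */
		printf("%lu\n", sum_child_power(&core[0]));
		return 0;
	}

Because the same accumulation runs at every level (SMT threads into a core,
cores into a package, packages into a node), one code path now covers all the
power savings cases that were previously handled separately per level.
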