diff --git a/Documentation/IRQ-affinity.txt b/Documentation/IRQ-affinity.txt
index 938d7dd..5f07fa5 100644
--- a/Documentation/IRQ-affinity.txt
+++ b/Documentation/IRQ-affinity.txt
@@ -1,17 +1,26 @@
+ChangeLog:
+	Started by Ingo Molnar
+	Update by Max Krasnyansky
 
-SMP IRQ affinity, started by Ingo Molnar
-
+SMP IRQ affinity
 
 /proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted
 for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed
 to turn off all CPUs, and if an IRQ controller does not support IRQ
 affinity then the value will not change from the default 0xffffffff.
 
+/proc/irq/default_smp_affinity specifies the default affinity mask that
+applies to all non-active IRQs. Once an IRQ is allocated/activated its
+affinity bitmask will be set to the default mask. It can then be changed
+as described above. The default mask is 0xffffffff.
+
 Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting
-the IRQ to CPU4-7 (this is an 8-CPU SMP box):
+it to CPU4-7 (this is an 8-CPU SMP box):
 
+[root@moon 44]# cd /proc/irq/44
 [root@moon 44]# cat smp_affinity
 ffffffff
+
 [root@moon 44]# echo 0f > smp_affinity
 [root@moon 44]# cat smp_affinity
 0000000f
@@ -21,17 +30,27 @@ PING hell (195.4.7.3): 56 data bytes
 --- hell ping statistics ---
 6029 packets transmitted, 6027 packets received, 0% packet loss
 round-trip min/avg/max = 0.1/0.1/0.4 ms
-[root@moon 44]# cat /proc/interrupts | grep 44:
- 44:          0       1785       1785       1783       1783          1
-               1          0   IO-APIC-level  eth1
+[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
+           CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7
+ 44:       1068       1785       1785       1783          0          0          0          0   IO-APIC-level  eth1
+
+As can be seen from the line above, IRQ44 was delivered only to the first four
+processors (0-3).
+Now let's restrict that IRQ to CPU(4-7).
+
 [root@moon 44]# echo f0 > smp_affinity
+[root@moon 44]# cat smp_affinity
+000000f0
 [root@moon 44]# ping -f h
 PING hell (195.4.7.3): 56 data bytes
 ..
 --- hell ping statistics ---
 2779 packets transmitted, 2777 packets received, 0% packet loss
 round-trip min/avg/max = 0.1/0.5/585.4 ms
-[root@moon 44]# cat /proc/interrupts | grep 44:
- 44:       1068       1785       1785       1784       1784       1069
-            1070       1069   IO-APIC-level  eth1
-[root@moon 44]#
+[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
+           CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7
+ 44:       1068       1785       1785       1783       1784       1069       1070       1069   IO-APIC-level  eth1
+
+This time around IRQ44 was delivered only to the last four processors,
+i.e. the counters for CPU0-3 did not change.
+
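The smp_affinity values in the session above are hexadecimal CPU bitmasks:
bit N set means CPU N may handle the IRQ. As a quick cross-check of the masks
quoted in the example, a small standalone decoder in C; it is not part of the
patch, and the default argument "f0" is simply the mask from the session
above:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
        /* parse an smp_affinity-style hex mask, e.g. "f0" -> CPU4-7 */
        unsigned long mask = strtoul(argc > 1 ? argv[1] : "f0", NULL, 16);
        unsigned int cpu;

        for (cpu = 0; cpu < 8 * sizeof(mask); cpu++)
                if (mask & (1UL << cpu))
                        printf("CPU%u\n", cpu);
        return 0;
}

Running it with "0f" prints CPU0 through CPU3, matching the first half of the
example session.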
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 518ebe6..a3df5df 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -379,28 +379,35 @@ i386 and x86_64 platforms support the new IRQ vector displays.
 Of some interest is the introduction of the /proc/irq directory to 2.4.
 It could be used to set IRQ to CPU affinity, this means that you can "hook" an
 IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the
-irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask
+irq subdir is one subdir for each IRQ, and two files: default_smp_affinity and
+prof_cpu_mask.
 
 For example
   > ls /proc/irq/
   0  10  12  14  16  18  2  4  6  8  prof_cpu_mask
-  1  11  13  15  17  19  3  5  7  9
+  1  11  13  15  17  19  3  5  7  9  default_smp_affinity
   > ls /proc/irq/0/
   smp_affinity
 
-The contents of the prof_cpu_mask file and each smp_affinity file for each IRQ
-is the same by default:
+smp_affinity is a bitmask, in which you can specify which CPUs can handle the
+IRQ. You can set it by doing:
 
-  > cat /proc/irq/0/smp_affinity
-  ffffffff
+  > echo 1 > /proc/irq/10/smp_affinity
+
+This means that only the first CPU will handle the IRQ, but you can also echo
+5, which means that only the first and third CPUs can handle the IRQ.
 
-It's a bitmask, in which you can specify which CPUs can handle the IRQ, you can
-set it by doing:
+The contents of each smp_affinity file are the same by default:
+
+  > cat /proc/irq/0/smp_affinity
+  ffffffff
 
-  > echo 1 > /proc/irq/prof_cpu_mask
+The default_smp_affinity mask applies to all non-active IRQs, which are the
+IRQs which have not yet been allocated/activated, and hence which lack a
+/proc/irq/[0-9]* directory.
 
-This means that only the first CPU will handle the IRQ, but you can also echo 5
-which means that only the first and fourth CPU can handle the IRQ.
+prof_cpu_mask specifies which CPUs are to be profiled by the system-wide
+profiler. The default value is ffffffff (all CPUs).
 
 The way IRQs are routed is handled by the IO-APIC, and it's Round Robin
 between all the CPUs which are allowed to handle it. As usual the kernel has
diff --git a/Makefile b/Makefile
index 6cf1113..8d6d660 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 25
-EXTRAVERSION = .6
+EXTRAVERSION = .6.cpuisol1
 NAME = Funky Weasel is Jiggy wit it
 
 # *DOCUMENTATION*
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index facf82a..c626a82 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -42,8 +42,7 @@ void ack_bad_irq(unsigned int irq)
 #ifdef CONFIG_SMP
 static char irq_user_affinity[NR_IRQS];
 
-int
-select_smp_affinity(unsigned int irq)
+int irq_select_affinity(unsigned int irq)
 {
 	static int last_cpu;
 	int cpu = last_cpu + 1;
@@ -51,7 +50,7 @@ select_smp_affinity(unsigned int irq)
 	if (!irq_desc[irq].chip->set_affinity || irq_user_affinity[irq])
 		return 1;
 
-	while (!cpu_possible(cpu))
+	while (!cpu_possible(cpu) || !cpu_isset(cpu, irq_default_affinity))
 		cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
 	last_cpu = cpu;
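For review purposes, the round-robin scan that the alpha irq_select_affinity()
above performs can be read in isolation: starting after the CPU picked last
time, advance with wrap-around until a CPU allowed by the default affinity
mask is found. This is a standalone C sketch, not the kernel code: NCPUS
stands in for NR_CPUS, cpu_ok() for the cpu_possible()/cpu_isset() pair, and
the mask is assumed non-empty (an empty mask would loop forever):

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 8

static bool cpu_ok(unsigned int cpu, unsigned long mask)
{
        /* models cpu_possible(cpu) && cpu_isset(cpu, irq_default_affinity) */
        return cpu < NCPUS && (mask & (1UL << cpu));
}

static int pick_cpu(unsigned long mask)
{
        static int last_cpu;
        int cpu = last_cpu + 1;

        while (!cpu_ok(cpu, mask))
                cpu = (cpu < (NCPUS - 1) ? cpu + 1 : 0);
        last_cpu = cpu;
        return cpu;
}

int main(void)
{
        int i;

        /* with mask f0 the picks rotate over CPU4-7: 4 5 6 7 4 5 ... */
        for (i = 0; i < 6; i++)
                printf("cpu%d\n", pick_cpu(0xf0));
        return 0;
}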
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 176e5e7..8faebf8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -228,8 +228,11 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
 
 #endif /* CONFIG_GENERIC_PENDING_IRQ */
 
+extern cpumask_t irq_default_affinity;
+
 extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
+extern int irq_select_affinity(unsigned int irq);
 
 #else /* CONFIG_SMP */
 
@@ -243,6 +246,8 @@ static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 static inline int irq_can_set_affinity(unsigned int irq)
 {
 	return 0;
 }
 
+static inline int irq_select_affinity(unsigned int irq) { return 0; }
+
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQBALANCE
@@ -253,15 +258,6 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
 }
 #endif
 
-#ifdef CONFIG_AUTO_IRQ_AFFINITY
-extern int select_smp_affinity(unsigned int irq);
-#else
-static inline int select_smp_affinity(unsigned int irq)
-{
-	return 1;
-}
-#endif
-
 extern int no_irq_affinity;
 
 static inline int irq_balancing_disabled(unsigned int irq)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 571f01d..832e4f2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -976,6 +976,8 @@ extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *,
 				  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      struct file *, void __user *, size_t *, loff_t *);
+extern int proc_docpumask(struct ctl_table *, int, struct file *,
+			  void __user *, size_t *, loff_t *);
 
 extern int do_sysctl (int __user *name, int nlen,
 		      void __user *oldval, size_t __user *oldlenp,
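The include/linux/irq.h change above follows the usual config-stub idiom: the
CONFIG_SMP branch declares the real irq_select_affinity() and the !CONFIG_SMP
branch supplies a static inline stub with the same signature, so call sites
such as setup_irq() compile unchanged on UP. A minimal standalone
illustration of the same idiom, not part of the patch (MY_SMP and
my_select_affinity() are made-up names):

#ifdef MY_SMP
/* real implementation lives in a .c file on SMP builds */
extern int my_select_affinity(unsigned int irq);
#else
/* UP stub: same signature, compiles away to nothing */
static inline int my_select_affinity(unsigned int irq)
{
        return 0;
}
#endif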
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a1b61f4..29c6304 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1789,6 +1789,12 @@ static void common_cpu_mem_hotplug_unplug(void)
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 	scan_for_empty_cpusets(&top_cpuset);
 
+	/*
+	 * Scheduler destroys domains on hotplug events.
+	 * Rebuild them based on the current settings.
+	 */
+	rebuild_sched_domains();
+
 	cgroup_unlock();
 }
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 438a014..80c7573 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,8 @@
 
 #ifdef CONFIG_SMP
 
+cpumask_t irq_default_affinity = CPU_MASK_ALL;
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
@@ -94,6 +96,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 	return 0;
 }
 
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+/*
+ * Generic version of the affinity autoselector.
+ */
+int irq_select_affinity(unsigned int irq)
+{
+	cpumask_t mask;
+
+	if (!irq_can_set_affinity(irq))
+		return 0;
+
+	cpus_and(mask, cpu_online_map, irq_default_affinity);
+
+	irq_desc[irq].affinity = mask;
+	irq_desc[irq].chip->set_affinity(irq, mask);
+
+	set_balance_irq_affinity(irq, mask);
+	return 0;
+}
+#endif
+
 #endif
 
 /**
@@ -376,6 +399,9 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		} else
 			/* Undo nested disables: */
 			desc->depth = 1;
+
+		/* Set default affinity mask once everything is setup */
+		irq_select_affinity(irq);
 	}
 	/* Reset broken irq detection when installing new handler */
 	desc->irq_count = 0;
@@ -555,8 +581,6 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	action->next = NULL;
 	action->dev_id = dev_id;
 
-	select_smp_affinity(irq);
-
 #ifdef CONFIG_DEBUG_SHIRQ
 	if (irqflags & IRQF_SHARED) {
 		/*
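The generic autoselector added to kernel/irq/manage.c above boils down to one
mask intersection: an IRQ may go to any CPU that is both online and present
in irq_default_affinity. A standalone C sketch of that computation, not part
of the patch (cpumask_t, cpu_online_map and irq_default_affinity are modeled
as plain bitmasks):

#include <stdio.h>

int main(void)
{
        unsigned long online = 0xff;            /* CPUs 0-7 online */
        unsigned long def = 0x0f;               /* default_smp_affinity = f */
        unsigned long mask = online & def;      /* models cpus_and() */

        /* this is the mask irq_select_affinity() would program */
        printf("effective affinity: %#lx\n", mask);     /* prints 0xf */
        return 0;
}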
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c2f2ccb..6c6d35d 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -44,7 +44,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 				   unsigned long count, void *data)
 {
 	unsigned int irq = (int)(long)data, full_count = count, err;
-	cpumask_t new_value, tmp;
+	cpumask_t new_value;
 
 	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
 	    irq_balancing_disabled(irq))
@@ -62,17 +62,51 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	 * way to make the system unusable accidentally :-) At least
 	 * one online CPU still has to be targeted.
 	 */
-	cpus_and(tmp, new_value, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpus_intersects(new_value, cpu_online_map))
 		/* Special case for empty set - allow the architecture
 		   code to set default SMP affinity. */
-		return select_smp_affinity(irq) ? -EINVAL : full_count;
+		return irq_select_affinity(irq) ? -EINVAL : full_count;
 
 	irq_set_affinity(irq, new_value);
 
 	return full_count;
 }
 
+static int default_affinity_read(char *page, char **start, off_t off,
+				 int count, int *eof, void *data)
+{
+	int len = cpumask_scnprintf(page, count, irq_default_affinity);
+	if (count - len < 2)
+		return -EINVAL;
+	len += sprintf(page + len, "\n");
+	return len;
+}
+
+static int default_affinity_write(struct file *file, const char __user *buffer,
+				  unsigned long count, void *data)
+{
+	unsigned int full_count = count, err;
+	cpumask_t new_value;
+
+	err = cpumask_parse_user(buffer, count, new_value);
+	if (err)
+		return err;
+
+	if (!is_affinity_mask_valid(new_value))
+		return -EINVAL;
+
+	/*
+	 * Do not allow disabling IRQs completely - it's a too easy
+	 * way to make the system unusable accidentally :-) At least
+	 * one online CPU still has to be targeted.
+	 */
+	if (!cpus_intersects(new_value, cpu_online_map))
+		return -EINVAL;
+
+	irq_default_affinity = new_value;
+
+	return full_count;
+}
+
 #endif
 
 static int irq_spurious_read(char *page, char **start, off_t off,
@@ -171,6 +205,21 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 		remove_proc_entry(action->dir->name, irq_desc[irq].dir);
 }
 
+void register_default_affinity_proc(void)
+{
+#ifdef CONFIG_SMP
+	struct proc_dir_entry *entry;
+
+	/* create /proc/irq/default_smp_affinity */
+	entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
+	if (entry) {
+		entry->data = NULL;
+		entry->read_proc = default_affinity_read;
+		entry->write_proc = default_affinity_write;
+	}
+#endif
+}
+
 void init_irq_proc(void)
 {
 	int i;
@@ -180,6 +229,8 @@ void init_irq_proc(void)
 	if (!root_irq_dir)
 		return;
 
+	register_default_affinity_proc();
+
 	/*
 	 * Create entries for all existing IRQs.
 	 */
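To exercise the new /proc/irq/default_smp_affinity file registered above from
userspace, something along these lines should do. This is a sketch, not part
of the patch, and the mask value "f" is only an example; per the check in
default_affinity_write(), a write is rejected with -EINVAL unless the mask
contains at least one online CPU:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[64];
        ssize_t n;
        int fd = open("/proc/irq/default_smp_affinity", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("default affinity: %s", buf);    /* "\n" included */
        }
        /* make CPUs 0-3 the default for IRQs activated from now on */
        if (write(fd, "f", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}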
diff --git a/kernel/sched.c b/kernel/sched.c
index 1e4596c..d3d202d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6918,6 +6918,19 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 }
 
 /*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+	ndoms_cur = 0;
+	if (doms_cur != &fallback_doms) {
+		kfree(doms_cur);
+		doms_cur = &fallback_doms;
+	}
+}
+
+/*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
@@ -7037,6 +7050,7 @@ int arch_reinit_sched_domains(void)
 
 	get_online_cpus();
 	detach_destroy_domains(&cpu_online_map);
+	free_sched_domains();
 	err = arch_init_sched_domains(&cpu_online_map);
 	put_online_cpus();
 
@@ -7121,6 +7135,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
+		free_sched_domains();
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -7139,8 +7154,16 @@ static int update_sched_domains(struct notifier_block *nfb,
 		return NOTIFY_DONE;
 	}
 
+#ifndef CONFIG_CPUSETS
+	/*
+	 * Create default domain partitioning if cpusets are disabled.
+	 * Otherwise we let cpusets rebuild the domains based on the
+	 * current setup.
+	 */
+
 	/* The hotplug lock is already held by cpu_up/cpu_down */
 	arch_init_sched_domains(&cpu_online_map);
+#endif
 
 	return NOTIFY_OK;
 }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6f4e0e1..e704689 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -30,6 +30,9 @@ static enum stopmachine_state stopmachine_state;
 static unsigned int stopmachine_num_threads;
 static atomic_t stopmachine_thread_ack;
 
+/* Mask of the CPUs the stopmachine is allowed to stop */
+cpumask_t sysctl_stopmachine_cpus = CPU_MASK_ALL;
+
 static int stopmachine(void *cpu)
 {
 	int irqs_disabled = 0;
@@ -100,6 +103,8 @@ static int stop_machine(void)
 	for_each_online_cpu(i) {
 		if (i == raw_smp_processor_id())
 			continue;
+		if (!cpu_isset(i, sysctl_stopmachine_cpus))
+			continue;
 		ret = kernel_thread(stopmachine, (void *)(long)i, CLONE_KERNEL);
 		if (ret < 0)
 			break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2a2d68..e424d16 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,6 +80,7 @@ extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int latencytop_enabled;
+extern cpumask_t sysctl_stopmachine_cpus;
 
 /* Constants used for minimum and maximum */
 #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
@@ -399,6 +400,17 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+
+#ifdef CONFIG_STOP_MACHINE
+	{
+		.procname	= "stopmachine_cpus",
+		.data		= &sysctl_stopmachine_cpus,
+		.maxlen		= sizeof(cpumask_t),
+		.mode		= 0644,
+		.proc_handler	= &proc_docpumask,
+	},
+#endif
+
 #ifdef CONFIG_BLK_DEV_INITRD
 	{
 		.ctl_name	= KERN_REALROOTDEV,
@@ -2484,6 +2496,61 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
 	return 0;
 }
 
+static int _proc_do_cpumask(void *data, int maxlen, int write,
+			    struct file *filp, void __user *buffer,
+			    size_t *lenp, loff_t *ppos)
+{
+	cpumask_t *mask = (cpumask_t *) data;
+	ssize_t len;
+	int err;
+
+	if (!data || !maxlen || !*lenp) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		err = cpumask_parse_user(buffer, *lenp, *mask);
+		if (err)
+			return err;
+		*ppos += *lenp;
+	} else {
+		char *page = (char *) __get_free_page(GFP_TEMPORARY);
+		if (!page)
+			return -ENOMEM;
+
+		len = cpumask_scnprintf(page, PAGE_SIZE, *mask);
+		len = simple_read_from_buffer(buffer, *lenp, ppos, page, len);
+
+		free_page((unsigned long) page);
+
+		if (len < 0)
+			return len;
+		*lenp = len;
+	}
+	return 0;
+}
+
+/**
+ * proc_docpumask - read/write a cpumask sysctl
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @filp: the file structure
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a cpumask from/to the user buffer.
+ *
+ * Returns 0 on success.
+ */
+int proc_docpumask(struct ctl_table *table, int write, struct file *filp,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return _proc_do_cpumask(table->data, table->maxlen, write, filp,
+				buffer, lenp, ppos);
+}
+
 #else /* CONFIG_PROC_FS */
 
 int proc_dostring(struct ctl_table *table, int write, struct file *filp,
@@ -2536,6 +2603,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_docpumask(struct ctl_table *table, int write, struct file *filp,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
 #endif /* CONFIG_PROC_FS */
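The new entry lands in kern_table, so it appears as
/proc/sys/kernel/stopmachine_cpus and takes the same hex mask format that
proc_docpumask() parses via cpumask_parse_user(). A userspace C sketch, not
part of the patch; the mask "0f" is only an example and would keep
stop_machine threads off CPUs 4-7:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sys/kernel/stopmachine_cpus", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* restrict stop_machine to CPUs 0-3 */
        if (write(fd, "0f", 2) != 2)
                perror("write");
        close(fd);
        return 0;
}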