diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
new file mode 100644
index 0000000..32dde5b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -0,0 +1,41 @@
+What:		/sys/devices/system/cpu/...
+Date:		Feb. 2008
+KernelVersion:	2.6.24
+Contact:	LKML
+Description:
+
+The /sys/devices/system/cpu tree provides information about all CPUs
+known to the running kernel.
+
+The following files are created for each CPU. 'N' is the CPU number.
+
+/sys/devices/system/cpu/cpuN/
+	online (0644)	On-line attribute. Indicates whether the CPU is
+			on-line. The CPU can be brought off-line by
+			writing '0' into this file and brought back
+			on-line by writing '1' into it. This attribute
+			is not available for CPUs that cannot be brought
+			off-line, typically cpu0. For more information
+			see Documentation/cpu-hotplug.txt.
+
+	isolated (0600)	Isolation attribute. Indicates whether the CPU
+			is isolated. The CPU can be isolated by writing
+			'1' into this file and un-isolated by writing
+			'0' into it. In order to isolate a CPU it must
+			first be brought off-line. This attribute is
+			not available for CPUs that cannot be brought
+			off-line, typically cpu0.
+			Note that this attribute is present only if
+			"CPU isolation" (CONFIG_CPUISOL) is enabled.
+			For more information see
+			Documentation/cpu-isolation.txt.
+
+	cpufreq (0755)	Frequency scaling state.
+			For more information see
+			Documentation/cpu-freq/...
+
+	cache (0755)	Cache information. FIXME
+
+	cpuidle (0755)	Idle state information. FIXME
+
+	topology (0755)	Topology information. FIXME
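For illustration, the off-line/isolate/on-line sequence implied by the "isolated" attribute above can be driven from a small user-space helper. This is a sketch, not part of the patch: the write_cpu_attr() helper and the hard-coded CPU number are invented for the example, and the program must run as root.

    #include <stdio.h>
    #include <stdlib.h>

    /* Write a single value to /sys/devices/system/cpu/cpu<N>/<attr>. */
    static void write_cpu_attr(int cpu, const char *attr, const char *val)
    {
            char path[128];
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/cpu%d/%s", cpu, attr);
            f = fopen(path, "w");
            if (!f || fputs(val, f) == EOF) {
                    perror(path);
                    exit(1);
            }
            fclose(f);
    }

    int main(void)
    {
            /* Isolate CPU 2: it must be off-line while the bit is flipped. */
            write_cpu_attr(2, "online", "0");
            write_cpu_attr(2, "isolated", "1");
            write_cpu_attr(2, "online", "1");
            return 0;
    }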
diff --git a/Documentation/cpu-isolation.txt b/Documentation/cpu-isolation.txt
new file mode 100644
index 0000000..b9ca425
--- /dev/null
+++ b/Documentation/cpu-isolation.txt
@@ -0,0 +1,113 @@
+CPU isolation support in Linux(tm) Kernel
+
+Maintainers:
+
+Scheduler and scheduler domain bits:
+	Ingo Molnar
+
+General framework, irq and workqueue isolation:
+	Max Krasnyanskiy
+
+ChangeLog:
+- Initial version. Feb 2008, MaxK
+
+Introduction
+------------
+
+The primary idea behind CPU isolation is the ability to use some CPU
+cores as dedicated engines for running user-space code with minimal
+kernel overhead/intervention; think of it as an SPE in the Cell
+processor. For example, CPU isolation makes it possible to run a
+CPU-intensive (100%) RT task on one of the processors without
+adversely affecting, or being affected by, other system activity.
+With the current (as of early 2008) multi-core CPU trend we may see
+more and more applications that exploit this capability: real-time
+gaming engines, simulators, hard real-time apps, etc.
+
+Current CPU isolation support consists of the following features:
+
+1. Isolated CPU(s) are excluded from the scheduler load-balancing
+   logic. Applications must explicitly bind threads in order to run
+   on those CPU(s).
+
+2. By default, interrupts are not routed to the isolated CPU(s).
+   Users must route interrupts (if any) to those CPU(s) explicitly.
+
+3. The kernel avoids any activity on the isolated CPU(s) as much as
+   possible. This includes workqueues, per-CPU threads, etc. Note
+   that this feature is optional and is disabled by default.
+
+Kernel configuration options
+----------------------------
+
+The following options must be enabled in order to use CPU isolation:
+
+   CONFIG_CPUISOL             Top-level config option. Enables the
+                              general CPU isolation framework and
+                              implements features #1 and #2 above.
+
+   CONFIG_CPUISOL_WORKQUEUE   These options provide deeper isolation
+   CONFIG_CPUISOL_STOPMACHINE from various kernel subsystems. They
+   CONFIG_CPUISOL_...         implement feature #3 above. See the
+                              Kconfig help for more information on
+                              each individual option.
+
+How to isolate a CPU
+--------------------
+
+There are two ways to isolate a CPU:
+
+Kernel boot command line:
+	isolcpus=n0,n1,...
+
+	This option enables isolation for all CPUs specified in the
+	comma-separated list.
+	Example:
+		isolcpus=1,5
+
+Sysfs interface:
+	To isolate a CPU through this mechanism it must first be brought
+	off-line. Hence the command sequence looks like:
+		echo 0 > /sys/devices/system/cpu/cpuN/online
+		echo 1 > /sys/devices/system/cpu/cpuN/isolated
+		echo 1 > /sys/devices/system/cpu/cpuN/online
+
+
+Kernel interfaces
+-----------------
+
+cpu_isolated_map - Bitmap of all isolated CPUs.
+		This bitmap is updated either by the isolcpus= command-line
+		option or by writes to the sysfs attribute. In the latter
+		case the isolation bit can only be changed while the
+		corresponding CPU is off-line.
+
+cpu_isolated(N) - Returns non-zero if CPU N is isolated.
+
+
+User-space application interfaces
+---------------------------------
+
+From the user-space perspective, isolated CPUs are no different from
+any other CPU in the system. Once the required CPUs have been
+isolated, applications can use the standard POSIX APIs to bind their
+threads to them:
+	sched_setaffinity
+	sched_getaffinity
+	pthread_getaffinity_np
+	pthread_setaffinity_np
+
+One thing to keep in mind is that the Linux kernel makes heavy use of
+per-CPU data structures and mechanisms. For example, if a thread
+running on an isolated CPU makes a system call, that system call is
+serviced on the same CPU, and any timers, softirqs and other kernel
+activity it triggers will likely also run there. If this behavior is
+undesirable, threads running on isolated CPUs should avoid such
+system calls and delegate that work to threads running on
+non-isolated CPUs. For example, you probably do not want to write to
+the filesystem or the console. Generally, time-, mutex- and
+pthread-related system calls are safe.
+
+If you intend to use CPU isolation for real-time applications, please
+check out the RT wiki at
+	http://rt.wiki.kernel.org/index.php/Main_Page
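To make the "User-space application interfaces" section above concrete, here is a minimal sketch of a program that binds the calling thread to an isolated CPU with sched_setaffinity(). It assumes CPU 1 has already been isolated (e.g. with isolcpus=1); error handling is minimal and the CPU number is invented for the example.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            /* Bind the calling thread to CPU 1, assumed to be isolated. */
            CPU_ZERO(&set);
            CPU_SET(1, &set);
            if (sched_setaffinity(0, sizeof(set), &set) < 0) {
                    perror("sched_setaffinity");
                    return 1;
            }

            /* From here on this thread runs only on the isolated CPU;
             * the CPU-bound RT work would go here. */
            return 0;
    }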
source "kernel/Kconfig.preempt" +source "kernel/Kconfig.cpuisol" config X86_UP_APIC bool "Local APIC support on uniprocessors" diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a6b1490..6ec8580 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -469,7 +469,7 @@ static void do_irq_balance(void) for_each_possible_cpu(i) { int package_index; CPU_IRQ(i) = 0; - if (!cpu_online(i)) + if (!cpu_online(i) || cpu_isolated(i)) continue; package_index = CPU_TO_PACKAGEINDEX(i); for (j = 0; j < NR_IRQS; j++) { diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 4054507..f5054c0 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -55,10 +55,58 @@ static ssize_t store_online(struct sys_device *dev, const char *buf, } static SYSDEV_ATTR(online, 0644, show_online, store_online); +#ifdef CONFIG_CPUISOL +/* + * This is under config hotplug because in order to + * dynamically isolate a CPU it needs to be brought off-line first. + * In other words the sequence is + * echo 0 > /sys/device/system/cpuN/online + * echo 1 > /sys/device/system/cpuN/isolated + * echo 1 > /sys/device/system/cpuN/online + */ +static ssize_t show_isol(struct sys_device *dev, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + + return sprintf(buf, "%u\n", !!cpu_isolated(cpu->sysdev.id)); +} + +static ssize_t store_isol(struct sys_device *dev, const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + ssize_t ret = 0; + + if (cpu_online(cpu->sysdev.id)) + return -EBUSY; + + switch (buf[0]) { + case '0': + cpu_clear(cpu->sysdev.id, cpu_isolated_map); + break; + case '1': + cpu_set(cpu->sysdev.id, cpu_isolated_map); + break; + default: + ret = -EINVAL; + } + + if (ret >= 0) + ret = count; + return ret; +} +static SYSDEV_ATTR(isolated, 0600, show_isol, store_isol); +#endif /* CONFIG_CPUISOL */ + static void __devinit register_cpu_control(struct cpu *cpu) { sysdev_create_file(&cpu->sysdev, &attr_online); + +#ifdef CONFIG_CPUISOL + sysdev_create_file(&cpu->sysdev, &attr_isolated); +#endif } + void unregister_cpu(struct cpu *cpu) { int logical_cpu = cpu->sysdev.id; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 85bd790..84d1561 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -380,6 +380,7 @@ static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp, extern cpumask_t cpu_possible_map; extern cpumask_t cpu_online_map; extern cpumask_t cpu_present_map; +extern cpumask_t cpu_isolated_map; #if NR_CPUS > 1 #define num_online_cpus() cpus_weight(cpu_online_map) @@ -388,6 +389,7 @@ extern cpumask_t cpu_present_map; #define cpu_online(cpu) cpu_isset((cpu), cpu_online_map) #define cpu_possible(cpu) cpu_isset((cpu), cpu_possible_map) #define cpu_present(cpu) cpu_isset((cpu), cpu_present_map) +#define cpu_isolated(cpu) cpu_isset((cpu), cpu_isolated_map) #else #define num_online_cpus() 1 #define num_possible_cpus() 1 @@ -395,6 +397,7 @@ extern cpumask_t cpu_present_map; #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) +#define cpu_isolated(cpu) (0) #endif #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) diff --git a/include/linux/irq.h b/include/linux/irq.h index 4669be0..42d119e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -253,14 +253,7 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) } #endif -#ifdef CONFIG_AUTO_IRQ_AFFINITY extern int 
 extern int
 select_smp_affinity(unsigned int irq);
-#else
-static inline int select_smp_affinity(unsigned int irq)
-{
-	return 1;
-}
-#endif
 
 extern int no_irq_affinity;
diff --git a/kernel/Kconfig.cpuisol b/kernel/Kconfig.cpuisol
new file mode 100644
index 0000000..24c1ef0
--- /dev/null
+++ b/kernel/Kconfig.cpuisol
@@ -0,0 +1,42 @@
+config CPUISOL
+	depends on SMP
+	bool "CPU isolation"
+	help
+	  This option enables support for CPU isolation. If enabled, the
+	  kernel will try to avoid kernel activity on the isolated CPUs.
+	  By default, user-space threads are not scheduled on isolated
+	  CPUs unless they explicitly request it via sched_setaffinity()
+	  and pthread_setaffinity_np() calls. Isolated CPUs are not
+	  subject to the scheduler load-balancing algorithms.
+
+	  This feature is useful for hard real-time and high-performance
+	  applications. See Documentation/cpu-isolation.txt for more
+	  details.
+
+	  If unsure say 'N'.
+
+config CPUISOL_WORKQUEUE
+	bool "Do not schedule workqueues on the isolated CPUs (EXPERIMENTAL)"
+	depends on CPUISOL && EXPERIMENTAL
+	help
+	  If this option is enabled, the kernel will not schedule
+	  workqueues on the isolated CPUs. Please note that at this point
+	  this feature is experimental. It breaks certain things, such as
+	  OProfile, that rely heavily on per-CPU workqueues.
+
+	  Say 'Y' to enable workqueue isolation. If unsure say 'N'.
+
+config CPUISOL_STOPMACHINE
+	bool "Do not halt isolated CPUs with Stop Machine (EXPERIMENTAL)"
+	depends on CPUISOL && STOP_MACHINE && EXPERIMENTAL
+	help
+	  If this option is enabled, the kernel will not halt isolated
+	  CPUs when Stop Machine is triggered. Stop Machine is currently
+	  used only for module insertion and removal.
+	  Please note that at this point this feature is experimental.
+	  It is not known to break anything, but it can potentially
+	  introduce instability due to race conditions in the module
+	  removal logic.
+
+	  Say 'Y' if dynamic module insertion and removal is required
+	  on a system that uses isolated CPUs. If unsure say 'N'.
diff --git a/kernel/Makefile b/kernel/Makefile
index dfa9695..d7716a9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
-	    exit.o itimer.o time.o softirq.o resource.o \
+	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
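Because the Makefile change builds cpu.o unconditionally, cpu_isolated_map and cpu_isolated() are available to all kernel code; on !SMP builds the macro simply evaluates to 0. As an illustrative sketch (not part of the patch; the module is invented), a trivial module could report the isolation state of every CPU at load time:

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/cpumask.h>

    /* Sketch: report which CPUs are currently marked isolated. */
    static int __init isoldump_init(void)
    {
            int cpu;

            for_each_possible_cpu(cpu)
                    printk(KERN_INFO "cpu%d: %s\n", cpu,
                           cpu_isolated(cpu) ? "isolated" : "not isolated");
            return 0;
    }

    static void __exit isoldump_exit(void)
    {
    }

    module_init(isoldump_init);
    module_exit(isoldump_exit);
    MODULE_LICENSE("GPL");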
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6b3a0c1..2f52838 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,8 +15,40 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 
+/*
+ * Represents all CPUs present in the system.
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new CPUs are detected via any platform-specific method,
+ * such as ACPI, for example.
+ */
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+/*
+ * Represents isolated CPUs.
+ * In general any kernel activity should be avoided as much as possible
+ * on these CPUs. Isolated CPUs are not load balanced by the scheduler.
+ */
+cpumask_t cpu_isolated_map __read_mostly = CPU_MASK_NONE;
+EXPORT_SYMBOL(cpu_isolated_map);
+
+#ifndef CONFIG_SMP
+
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+
+#endif
+
+#ifdef CONFIG_SMP
+
 /* This protects CPUs going up and down... */
 static DEFINE_MUTEX(cpu_add_remove_lock);
+
+/* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -347,3 +379,27 @@ out:
 	mutex_unlock(&cpu_add_remove_lock);
 }
 #endif	/* CONFIG_PM_SLEEP_SMP */
+
+#ifdef CONFIG_CPUISOL
+/* Setup the mask of isolated CPUs */
+
+static int __initdata isolcpu[NR_CPUS];
+
+static int __init isolated_cpu_setup(char *str)
+{
+	int i, n;
+
+	str = get_options(str, ARRAY_SIZE(isolcpu), isolcpu);
+	n = isolcpu[0];
+
+	cpus_clear(cpu_isolated_map);
+	for (i = 1; i <= n; i++)
+		if (isolcpu[i] < NR_CPUS)
+			cpu_set(isolcpu[i], cpu_isolated_map);
+	return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+#endif
+
+#endif	/* CONFIG_SMP */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1f31422..d66f4c7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -376,6 +376,9 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		} else
 			/* Undo nested disables: */
 			desc->depth = 1;
+
+		/* Set the default affinity mask once everything is set up */
+		select_smp_affinity(irq);
 	}
 	/* Reset broken irq detection when installing new handler */
 	desc->irq_count = 0;
@@ -485,6 +488,26 @@ void free_irq(unsigned int irq, void *dev_id)
 }
 EXPORT_SYMBOL(free_irq);
 
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+/*
+ * Generic version of the affinity autoselector.
+ * Called from setup_irq() with desc->lock held.
+ * (Should this be renamed select_irq_affinity()?)
+ */
+int select_smp_affinity(unsigned int irq)
+{
+	cpumask_t usable_cpus;
+
+	if (!irq_can_set_affinity(irq))
+		return 0;
+
+	cpus_andnot(usable_cpus, cpu_online_map, cpu_isolated_map);
+	irq_desc[irq].affinity = usable_cpus;
+	irq_desc[irq].chip->set_affinity(irq, usable_cpus);
+	return 0;
+}
+#endif
+
 /**
  *	request_irq - allocate an interrupt line
  *	@irq: Interrupt line to allocate
@@ -552,8 +575,6 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	action->next = NULL;
 	action->dev_id = dev_id;
 
-	select_smp_affinity(irq);
-
 #ifdef CONFIG_DEBUG_SHIRQ
 	if (irqflags & IRQF_SHARED) {
 		/*
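For clarity, this is what get_options() produces for the isolated_cpu_setup() parser above, following its documented convention of storing the number of parsed values in element 0. The walkthrough assumes the example boot parameter from the documentation:

    /*
     * Boot command line:  isolcpus=1,5
     *
     * After get_options(str, ARRAY_SIZE(isolcpu), isolcpu):
     *   isolcpu[0] == 2   number of values parsed
     *   isolcpu[1] == 1   first isolated CPU
     *   isolcpu[2] == 5   second isolated CPU
     *
     * The loop then sets bits 1 and 5 in cpu_isolated_map, so
     * cpu_isolated(1) and cpu_isolated(5) are true from early boot on.
     */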
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11c..ef39bfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4570,24 +4570,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 	return sched_setaffinity(pid, new_mask);
 }
 
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-#ifndef CONFIG_SMP
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-#endif
-
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
 {
 	struct task_struct *p;
@@ -5863,24 +5845,6 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 	rcu_assign_pointer(rq->sd, sd);
 }
 
-/* cpus with isolated domains */
-static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
-	int ints[NR_CPUS], i;
-
-	str = get_options(str, ARRAY_SIZE(ints), ints);
-	cpus_clear(cpu_isolated_map);
-	for (i = 1; i <= ints[0]; i++)
-		if (ints[i] < NR_CPUS)
-			cpu_set(ints[i], cpu_isolated_map);
-	return 1;
-}
-
-__setup("isolcpus=", isolated_cpu_setup);
-
 /*
  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
  * to a function which identifies what group(along with sched group) a CPU
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 319821e..85214bf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -90,6 +90,12 @@ static void stopmachine_set_state(enum stopmachine_state state)
 		cpu_relax();
 }
 
+#ifdef CONFIG_CPUISOL_STOPMACHINE
+#define cpu_unusable(cpu)	cpu_isolated(cpu)
+#else
+#define cpu_unusable(cpu)	(0)
+#endif
+
 static int stop_machine(void)
 {
 	int i, ret = 0;
@@ -99,7 +105,7 @@ static int stop_machine(void)
 	stopmachine_state = STOPMACHINE_WAIT;
 
 	for_each_online_cpu(i) {
-		if (i == raw_smp_processor_id())
+		if (i == raw_smp_processor_id() || cpu_unusable(i))
 			continue;
 		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
 		if (ret < 0)
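The caller-visible effect of CONFIG_CPUISOL_STOPMACHINE may be worth spelling out. A hedged sketch of a stop_machine_run() call site (do_nothing() and the wrapper are invented for the example):

    #include <linux/stop_machine.h>
    #include <linux/cpumask.h>

    static int do_nothing(void *unused)
    {
            return 0;
    }

    static int stopmachine_example(void)
    {
            /*
             * With CONFIG_CPUISOL_STOPMACHINE=y, isolated CPUs are NOT
             * halted while do_nothing() runs: they keep executing user
             * code. This is what allows module load/unload without
             * disturbing isolated CPUs, at the cost of the removal race
             * noted in the Kconfig help above.
             */
            return stop_machine_run(do_nothing, NULL, NR_CPUS);
    }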
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8db0b59..58766c4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -35,6 +35,16 @@
 #include <linux/lockdep.h>
 
 /*
+ * Stub out cpu_isolated() if isolated CPUs are allowed to
+ * run workqueues.
+ */
+#ifdef CONFIG_CPUISOL_WORKQUEUE
+#define cpu_unusable(cpu)	cpu_isolated(cpu)
+#else
+#define cpu_unusable(cpu)	(0)
+#endif
+
+/*
  * The per-CPU workqueue (if single thread, we always use the first
  * possible cpu).
 */
@@ -98,7 +108,7 @@ static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
 static
 struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
 {
-	if (unlikely(is_single_threaded(wq)))
+	if (unlikely(is_single_threaded(wq)) || cpu_unusable(cpu))
 		cpu = singlethread_cpu;
 	return per_cpu_ptr(wq->cpu_wq, cpu);
 }
@@ -230,9 +240,11 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 		timer->data = (unsigned long)dwork;
 		timer->function = delayed_work_timer_fn;
 
-		if (unlikely(cpu >= 0))
+		if (unlikely(cpu >= 0)) {
+			if (cpu_unusable(cpu))
+				cpu = singlethread_cpu;
 			add_timer_on(timer, cpu);
-		else
+		} else
 			add_timer(timer);
 		ret = 1;
 	}
@@ -608,7 +620,8 @@ int schedule_on_each_cpu(work_func_t func)
 	preempt_disable();		/* CPU hotplug */
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
-
+		if (cpu_unusable(cpu))
+			continue;
 		INIT_WORK(work, func);
 		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
@@ -755,7 +768,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 
 	for_each_possible_cpu(cpu) {
 		cwq = init_cpu_workqueue(wq, cpu);
-		if (err || !cpu_online(cpu))
+		if (err || !cpu_online(cpu) || cpu_unusable(cpu))
 			continue;
 		err = create_workqueue_thread(cwq, cpu);
 		start_workqueue_thread(cwq, cpu);
@@ -832,8 +845,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	struct cpu_workqueue_struct *cwq;
 	struct workqueue_struct *wq;
 
-	action &= ~CPU_TASKS_FROZEN;
+	if (cpu_unusable(cpu))
+		return NOTIFY_OK;
 
+	action &= ~CPU_TASKS_FROZEN;
+
 	switch (action) {
 	case CPU_LOCK_ACQUIRE:
 		mutex_lock(&workqueue_mutex);
@@ -874,7 +890,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 void __init init_workqueues(void)
 {
-	cpu_populated_map = cpu_online_map;
+	cpus_andnot(cpu_populated_map, cpu_online_map, cpu_isolated_map);
 	singlethread_cpu = first_cpu(cpu_possible_map);
 	cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
 	hotcpu_notifier(workqueue_cpu_callback, 0);
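Finally, a sketch of what the queue_delayed_work_on() change means for callers. This is not part of the patch; my_work_fn() and the CPU number are invented for the example:

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>
    #include <linux/kernel.h>
    #include <linux/smp.h>

    static void my_work_fn(struct work_struct *work)
    {
            printk(KERN_INFO "work ran on cpu%d\n", smp_processor_id());
    }

    static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);

    static void queue_example(void)
    {
            /*
             * Ask for CPU 2 explicitly. If CPU 2 is isolated and
             * CONFIG_CPUISOL_WORKQUEUE=y, the patched
             * queue_delayed_work_on() silently substitutes
             * singlethread_cpu, so the isolated CPU never runs
             * this work item.
             */
            schedule_delayed_work_on(2, &my_dwork, HZ);
    }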